Basic support for hidream i1 model.

Cleanup.
More flexible long clip support.
2025-04-16 08:33:29 +00:00 · 2025-04-15 17:35:05 -04:00 · 2025-04-15 12:13:28 -04:00 · 2025-04-15 10:32:21 -04:00 · 2025-04-14 18:00:33 -04:00 · 2025-04-13 12:21:12 -07:00
310 changed files with 430095 additions and 196554 deletions
--- a/.ci/update_windows/update.py
+++ b/.ci/update_windows/update.py
@ -28,12 +28,12 @@ def pull(repo, remote_name='origin', branch='master'):

                if repo.index.conflicts is not None:
                    for conflict in repo.index.conflicts:
-                        print('Conflicts found in:', conflict[0].path)
+                        print('Conflicts found in:', conflict[0].path)  # noqa: T201
                    raise AssertionError('Conflicts, ahhhhh!!')

                user = repo.default_signature
                tree = repo.index.write_tree()
-                commit = repo.create_commit('HEAD',
+                repo.create_commit('HEAD',
                                    user,
                                    user,
                                    'Merge!',
@ -49,18 +49,18 @@ repo_path = str(sys.argv[1])
 repo = pygit2.Repository(repo_path)
 ident = pygit2.Signature('comfyui', 'comfy@ui')
 try:
-    print("stashing current changes")
+    print("stashing current changes")  # noqa: T201
    repo.stash(ident)
 except KeyError:
-    print("nothing to stash")
+    print("nothing to stash")  # noqa: T201
 backup_branch_name = 'backup_branch_{}'.format(datetime.today().strftime('%Y-%m-%d_%H_%M_%S'))
-print("creating backup branch: {}".format(backup_branch_name))
+print("creating backup branch: {}".format(backup_branch_name))  # noqa: T201
 try:
    repo.branches.local.create(backup_branch_name, repo.head.peel())
 except:
    pass

-print("checking out master branch")
+print("checking out master branch")  # noqa: T201
 branch = repo.lookup_branch('master')
 if branch is None:
    ref = repo.lookup_reference('refs/remotes/origin/master')
@ -72,7 +72,7 @@ else:
    ref = repo.lookup_reference(branch.name)
    repo.checkout(ref)

-print("pulling latest changes")
+print("pulling latest changes")  # noqa: T201
 pull(repo)

 if "--stable" in sys.argv:
@ -94,7 +94,7 @@ if "--stable" in sys.argv:
    if latest_tag is not None:
        repo.checkout(latest_tag)

-print("Done!")
+print("Done!")  # noqa: T201

 self_update = True
 if len(sys.argv) > 2:
--- a/.ci/windows_nightly_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat
+++ b/.ci/windows_nightly_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat
@ -0,0 +1,2 @@
+.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast fp16_accumulation
+pause
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@ -3,8 +3,8 @@ name: Python Linting
 on: [push, pull_request]

 jobs:
-  pylint:
-    name: Run Pylint
+  ruff:
+    name: Run Ruff
    runs-on: ubuntu-latest

    steps:
@ -16,8 +16,8 @@ jobs:
      with:
        python-version: 3.x

-    - name: Install Pylint
-      run: pip install pylint
+    - name: Install Ruff
+      run: pip install ruff

-    - name: Run Pylint
-      run: pylint --rcfile=.pylintrc $(find . -type f -name "*.py")
+    - name: Run Ruff
+      run: ruff check .
--- a/.github/workflows/stable-release.yml
+++ b/.github/workflows/stable-release.yml
@ -12,7 +12,7 @@ on:
        description: 'CUDA version'
        required: true
        type: string
-        default: "124"
+        default: "126"
      python_minor:
        description: 'Python minor version'
        required: true
@ -22,7 +22,7 @@ on:
        description: 'Python patch version'
        required: true
        type: string
-        default: "7"
+        default: "9"


 jobs:
--- a/.github/workflows/test-build.yml
+++ b/.github/workflows/test-build.yml
@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
--- a/.github/workflows/test-ci.yml
+++ b/.github/workflows/test-ci.yml
@ -20,7 +20,8 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        os: [macos, linux, windows]
+        # os: [macos, linux, windows]
+        os: [macos, linux]
        python_version: ["3.9", "3.10", "3.11", "3.12"]
        cuda_version: ["12.1"]
        torch_version: ["stable"]
@ -31,9 +32,9 @@ jobs:
          - os: linux
            runner_label: [self-hosted, Linux]
            flags: ""
-          - os: windows
-            runner_label: [self-hosted, Windows]
-            flags: ""
+          # - os: windows
+          #   runner_label: [self-hosted, Windows]
+          #   flags: ""
    runs-on: ${{ matrix.runner_label }}
    steps:
      - name: Test Workflows
@ -45,28 +46,28 @@ jobs:
          google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
          comfyui_flags: ${{ matrix.flags }}

-  test-win-nightly:
-    strategy:
-      fail-fast: true
-      matrix:
-        os: [windows]
-        python_version: ["3.9", "3.10", "3.11", "3.12"]
-        cuda_version: ["12.1"]
-        torch_version: ["nightly"]
-        include:
-          - os: windows
-            runner_label: [self-hosted, Windows]
-            flags: ""
-    runs-on: ${{ matrix.runner_label }}
-    steps:
-      - name: Test Workflows
-        uses: comfy-org/comfy-action@main
-        with:
-          os: ${{ matrix.os }}
-          python_version: ${{ matrix.python_version }}
-          torch_version: ${{ matrix.torch_version }}
-          google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
-          comfyui_flags: ${{ matrix.flags }}
+  # test-win-nightly:
+  #   strategy:
+  #     fail-fast: true
+  #     matrix:
+  #       os: [windows]
+  #       python_version: ["3.9", "3.10", "3.11", "3.12"]
+  #       cuda_version: ["12.1"]
+  #       torch_version: ["nightly"]
+  #       include:
+  #         - os: windows
+  #           runner_label: [self-hosted, Windows]
+  #           flags: ""
+  #   runs-on: ${{ matrix.runner_label }}
+  #   steps:
+  #     - name: Test Workflows
+  #       uses: comfy-org/comfy-action@main
+  #       with:
+  #         os: ${{ matrix.os }}
+  #         python_version: ${{ matrix.python_version }}
+  #         torch_version: ${{ matrix.torch_version }}
+  #         google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
+  #         comfyui_flags: ${{ matrix.flags }}

  test-unix-nightly:
    strategy:
--- a/.github/workflows/test-launch.yml
+++ b/.github/workflows/test-launch.yml
@ -17,7 +17,7 @@ jobs:
        path: "ComfyUI"
    - uses: actions/setup-python@v4
      with:
-        python-version: '3.8'
+        python-version: '3.9'
    - name: Install requirements
      run: |
        python -m pip install --upgrade pip
--- a/.github/workflows/test-unit.yml
+++ b/.github/workflows/test-unit.yml
@ -18,7 +18,7 @@ jobs:
    - name: Set up Python      
      uses: actions/setup-python@v4
      with:
-        python-version: '3.10'
+        python-version: '3.12'
    - name: Install requirements
      run: |
        python -m pip install --upgrade pip
--- a/.github/workflows/update-version.yml
+++ b/.github/workflows/update-version.yml
@ -0,0 +1,58 @@
+name: Update Version File
+
+on:
+  pull_request:
+    paths:
+      - "pyproject.toml"
+    branches:
+      - master
+
+jobs:
+  update-version:
+    runs-on: ubuntu-latest
+    # Don't run on fork PRs
+    if: github.event.pull_request.head.repo.full_name == github.repository
+    permissions:
+      pull-requests: write
+      contents: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+
+      - name: Update comfyui_version.py
+        run: |
+          # Read version from pyproject.toml and update comfyui_version.py
+          python -c '
+          import tomllib
+
+          # Read version from pyproject.toml
+          with open("pyproject.toml", "rb") as f:
+              config = tomllib.load(f)
+              version = config["project"]["version"]
+
+          # Write version to comfyui_version.py
+          with open("comfyui_version.py", "w") as f:
+              f.write("# This file is automatically generated by the build process when version is\n")
+              f.write("# updated in pyproject.toml.\n")
+              f.write(f"__version__ = \"{version}\"\n")
+          '
+
+      - name: Commit changes
+        run: |
+          git config --local user.name "github-actions"
+          git config --local user.email "github-actions@github.com"
+          git fetch origin ${{ github.head_ref }}
+          git checkout -B ${{ github.head_ref }} origin/${{ github.head_ref }}
+          git add comfyui_version.py
+          git diff --quiet && git diff --staged --quiet || git commit -m "chore: Update comfyui_version.py to match pyproject.toml"
+          git push origin HEAD:${{ github.head_ref }}
--- a/.github/workflows/windows_release_dependencies.yml
+++ b/.github/workflows/windows_release_dependencies.yml
@ -17,7 +17,7 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "124"
+        default: "126"

      python_minor:
        description: 'python minor version'
@ -29,7 +29,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "7"
+        default: "9"
 #  push:
 #    branches:
 #      - master
--- a/.github/workflows/windows_release_nightly_pytorch.yml
+++ b/.github/workflows/windows_release_nightly_pytorch.yml
@ -7,19 +7,19 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "124"
+        default: "128"

      python_minor:
        description: 'python minor version'
        required: true
        type: string
-        default: "12"
+        default: "13"

      python_patch:
        description: 'python patch version'
        required: true
        type: string
-        default: "4"
+        default: "2"
 #  push:
 #    branches:
 #      - master
@ -34,7 +34,7 @@ jobs:
    steps:
        - uses: actions/checkout@v4
          with:
-            fetch-depth: 0
+            fetch-depth: 30
            persist-credentials: false
        - uses: actions/setup-python@v5
          with:
@ -74,7 +74,7 @@ jobs:
            pause" > ./update/update_comfyui_and_python_dependencies.bat
            cd ..

-            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch
+            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=512m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch
            mv ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI/ComfyUI_windows_portable_nvidia_or_cpu_nightly_pytorch.7z

            cd ComfyUI_windows_portable_nightly_pytorch
--- a/.github/workflows/windows_release_package.yml
+++ b/.github/workflows/windows_release_package.yml
@ -7,7 +7,7 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "124"
+        default: "126"

      python_minor:
        description: 'python minor version'
@ -19,7 +19,7 @@ on:
        description: 'python patch version'
        required: true
        type: string
-        default: "7"
+        default: "9"
 #  push:
 #    branches:
 #      - master
--- a/.pylintrc
+++ b/.pylintrc
@ -1,3 +0,0 @@
-[MESSAGES CONTROL]
-disable=all
-enable=eval-used
--- a/23
+++ b/23
@ -1 +1,24 @@
+# Admins
 * @comfyanonymous
+
+# Note: Github teams syntax cannot be used here as the repo is not owned by Comfy-Org.
+# Inlined the team members for now.
+
+# Maintainers
+*.md @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/tests/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/tests-unit/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/notebooks/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/script_examples/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/.github/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/requirements.txt @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+/pyproject.toml @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
+
+# Python web server
+/api_server/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata
+/app/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata
+/utils/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata
+
+# Node developers
+/comfy_extras/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink @webfiltered
+/comfy/comfy_types/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink @webfiltered
--- a/README.md
+++ b/README.md
@ -1,7 +1,7 @@
 <div align="center">

 # ComfyUI
-**The most powerful and modular diffusion model GUI and backend.**
+**The most powerful and modular visual AI engine and application.**


 [![Website][website-shield]][website-url]
@ -31,17 +31,47 @@
 ![ComfyUI Screenshot](https://github.com/user-attachments/assets/7ccaf2c1-9b72-41ae-9a89-5688c94b7abe)
 </div>

-This ui will let you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. For some workflow examples and see what ComfyUI can do you can check out:
-### [ComfyUI Examples](https://comfyanonymous.github.io/ComfyUI_examples/)
+ComfyUI lets you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. Available on Windows, Linux, and macOS.
+
+## Get Started
+
+#### [Desktop Application](https://www.comfy.org/download)
+- The easiest way to get started. 
+- Available on Windows & macOS.
+
+#### [Windows Portable Package](#installing)
+- Get the latest commits and completely portable.
+- Available on Windows.
+
+#### [Manual Install](#manual-install-windows-linux)
+Supports all operating systems and GPU types (NVIDIA, AMD, Intel, Apple Silicon, Ascend).
+
+## [Examples](https://comfyanonymous.github.io/ComfyUI_examples/)
+See what ComfyUI can do with the [example workflows](https://comfyanonymous.github.io/ComfyUI_examples/).

-### [Installing ComfyUI](#installing)

 ## Features
 - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
- Fully supports SD1.x, SD2.x, [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/), [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/), [SD3](https://comfyanonymous.github.io/ComfyUI_examples/sd3/) and [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
- [LTX-Video](https://comfyanonymous.github.io/ComfyUI_examples/ltxv/)
- [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
- [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/)
+- Image Models
+   - SD1.x, SD2.x,
+   - [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
+   - [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/)
+   - [SD3 and SD3.5](https://comfyanonymous.github.io/ComfyUI_examples/sd3/)
+   - Pixart Alpha and Sigma
+   - [AuraFlow](https://comfyanonymous.github.io/ComfyUI_examples/aura_flow/)
+   - [HunyuanDiT](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_dit/)
+   - [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
+   - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/)
+- Video Models
+   - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/)
+   - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/)
+   - [LTX-Video](https://comfyanonymous.github.io/ComfyUI_examples/ltxv/)
+   - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
+   - [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/)
+   - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
+- 3D Models
+   - [Hunyuan3D 2.0](https://docs.comfy.org/tutorials/3d/hunyuan3D-2)
+- [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
 - Asynchronous Queue system
 - Many optimizations: Only re-executes the parts of the workflow that changes between executions.
 - Smart memory management: can automatically run models on GPUs with as low as 1GB vram.
@ -61,9 +91,6 @@ This ui will let you design and execute advanced stable diffusion pipelines usin
 - [GLIGEN](https://comfyanonymous.github.io/ComfyUI_examples/gligen/)
 - [Model Merging](https://comfyanonymous.github.io/ComfyUI_examples/model_merging/)
 - [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/)
- [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
- [AuraFlow](https://comfyanonymous.github.io/ComfyUI_examples/aura_flow/)
- [HunyuanDiT](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_dit/)
 - Latent previews with [TAESD](#how-to-show-high-quality-previews)
 - Starts up very fast.
 - Works fully offline: will never download anything.
@ -101,6 +128,8 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git
 | `Q`                                 | Toggle visibility of the queue                                                                                     |
 | `H`                                  | Toggle visibility of history                                                                                       |
 | `R`                                  | Refresh graph                                                                                                      |
+| `F`                                  | Show/Hide menu                                                                                                      |
+| `.`                                  | Fit view to selection (Whole graph when nothing is selected)                                                        |
 | Double-Click LMB                   | Open node quick search palette                                                                                     |
 | `Shift` + Drag                       | Move multiple wires at once                                                                                        |
 | `Ctrl` + `Alt` + LMB                   | Disconnect all wires from clicked slot                                                                             |
@ -109,7 +138,7 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git

 # Installing

-## Windows
+## Windows Portable

 There is a portable standalone build for Windows that should work for running on Nvidia GPUs or for running on your CPU only on the [releases page](https://github.com/comfyanonymous/ComfyUI/releases).

@ -119,6 +148,8 @@ Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you

 If you have trouble extracting it, right click the file -> properties -> unblock

+If you have a 50 series Blackwell card like a 5090 or 5080 see [this discussion thread](https://github.com/comfyanonymous/ComfyUI/discussions/6643)
+
 #### How do I share models between another UI and ComfyUI?

 See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor.
@ -127,9 +158,18 @@ See the [Config file](extra_model_paths.yaml.example) to set the search paths fo

 To run it on services like paperspace, kaggle or colab you can use my [Jupyter Notebook](notebooks/comfyui_colab.ipynb)

+
+## [comfy-cli](https://docs.comfy.org/comfy-cli/getting-started)
+
+You can install and start ComfyUI using comfy-cli:
+```bash
+pip install comfy-cli
+comfy install
+```
+
 ## Manual Install (Windows, Linux)

-Note that some dependencies do not yet support python 3.13 so using 3.12 is recommended.
+python 3.13 is supported but using 3.12 is recommended because some custom nodes and their dependencies might not support it yet.

 Git clone this repo.

@ -141,21 +181,45 @@ Put your VAE in: models/vae
 ### AMD GPUs (Linux only)
 AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:

-```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2```
+```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2.4```

-This is the command to install the nightly with ROCm 6.2 which might have some performance improvements:
+This is the command to install the nightly with ROCm 6.3 which might have some performance improvements:

-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.2```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3```
+
+### Intel GPUs (Windows and Linux)
+
+(Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip (currently available in PyTorch nightly builds). More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html)
+  
+1. To install PyTorch nightly, use the following command:
+
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu```
+
+2. Launch ComfyUI by running `python main.py`
+
+
+(Option 2) Alternatively, Intel GPUs supported by Intel Extension for PyTorch (IPEX) can leverage IPEX for improved performance.
+
+1. For Intel® Arc™ A-Series Graphics utilizing IPEX, create a conda environment and use the commands below:
+
+```
+conda install libuv
+pip install torch==2.3.1.post0+cxx11.abi torchvision==0.18.1.post0+cxx11.abi torchaudio==2.3.1.post0+cxx11.abi intel-extension-for-pytorch==2.3.110.post0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
+```
+
+For other supported Intel GPUs with IPEX, visit [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) for more information.
+
+Additional discussion and help can be found [here](https://github.com/comfyanonymous/ComfyUI/discussions/476).

 ### NVIDIA

 Nvidia users should install stable pytorch using this command:

-```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu124```
+```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu126```

-This is the command to install pytorch nightly instead which might have performance improvements:
+This is the command to install pytorch nightly instead which supports the new blackwell 50xx series GPUs and might have performance improvements.

-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128```

 #### Troubleshooting

@ -175,17 +239,6 @@ After this you should have everything installed and can proceed to running Comfy

 ### Others:

-#### Intel GPUs
-
-Intel GPU support is available for all Intel GPUs supported by Intel's Extension for Pytorch (IPEX) with the support requirements listed in the [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) page. Choose your platform and method of install and follow the instructions. The steps are as follows:
-
-1. Start by installing the drivers or kernel listed or newer in the Installation page of IPEX linked above for Windows and Linux if needed.
-1. Follow the instructions to install [Intel's oneAPI Basekit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html) for your platform.
-1. Install the packages for IPEX using the instructions provided in the Installation page for your platform.
-1. Follow the [ComfyUI manual installation](#manual-install-windows-linux) instructions for Windows and Linux and run ComfyUI normally as described above after everything is installed.
-
-Additional discussion and help can be found [here](https://github.com/comfyanonymous/ComfyUI/discussions/476).
-
 #### Apple Mac silicon

 You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS version.
@ -201,6 +254,23 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve

 ```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```

+#### Ascend NPUs
+
+For models compatible with Ascend Extension for PyTorch (torch_npu). To get started, ensure your environment meets the prerequisites outlined on the [installation](https://ascend.github.io/docs/sources/ascend/quick_install.html) page. Here's a step-by-step guide tailored to your platform and installation method:
+
+1. Begin by installing the recommended or newer kernel version for Linux as specified in the Installation page of torch-npu, if necessary.
+2. Proceed with the installation of Ascend Basekit, which includes the driver, firmware, and CANN, following the instructions provided for your specific platform.
+3. Next, install the necessary packages for torch-npu by adhering to the platform-specific instructions on the [Installation](https://ascend.github.io/docs/sources/pytorch/install.html#pytorch) page.
+4. Finally, adhere to the [ComfyUI manual installation](#manual-install-windows-linux) guide for Linux. Once all components are installed, you can run ComfyUI as described earlier.
+
+#### Cambricon MLUs
+
+For models compatible with Cambricon Extension for PyTorch (torch_mlu). Here's a step-by-step guide tailored to your platform and installation method:
+
+1. Install the Cambricon CNToolkit by adhering to the platform-specific instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cntoolkit_3.7.2/cntoolkit_install_3.7.2/index.html)
+2. Next, install the PyTorch(torch_mlu) following the instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cambricon_pytorch_1.17.0/user_guide_1.9/index.html)
+3. Launch ComfyUI by running `python main.py`
+
 # Running

 ```python main.py```
@ -256,6 +326,8 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w

 ## Support and dev channel

+[Discord](https://comfy.org/discord): Try the #help or #feedback channels.
+
 [Matrix space: #comfyui_space:matrix.org](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) (it's like discord but open source).

 See also: [https://www.comfy.org/](https://www.comfy.org/)
@ -272,7 +344,7 @@ For any bugs, issues, or feature requests related to the frontend, please use th

 The new frontend is now the default for ComfyUI. However, please note:

-1. The frontend in the main ComfyUI repository is updated weekly.
+1. The frontend in the main ComfyUI repository is updated fortnightly.
 2. Daily releases are available in the separate frontend repository.

 To use the most up-to-date frontend version:
@ -289,7 +361,7 @@ To use the most up-to-date frontend version:
   --front-end-version Comfy-Org/ComfyUI_frontend@1.2.2
   ```

-This approach allows you to easily switch between the stable weekly release and the cutting-edge daily updates, or even specific versions for testing purposes.
+This approach allows you to easily switch between the stable fortnightly release and the cutting-edge daily updates, or even specific versions for testing purposes.

 ### Accessing the Legacy Frontend

@ -306,4 +378,3 @@ This will use a snapshot of the legacy frontend preserved in the [ComfyUI Legacy
 ### Which GPU should I buy for this?

 [See this page for some recommendations](https://github.com/comfyanonymous/ComfyUI/wiki/Which-GPU-should-I-buy-for-ComfyUI)
-
--- a/api_server/routes/internal/internal_routes.py
+++ b/api_server/routes/internal/internal_routes.py
@ -1,9 +1,9 @@
 from aiohttp import web
 from typing import Optional
-from folder_paths import models_dir, user_directory, output_directory, folder_names_and_paths
-from api_server.services.file_service import FileService
+from folder_paths import folder_names_and_paths, get_directory_by_type
 from api_server.services.terminal_service import TerminalService
 import app.logger
+import os

 class InternalRoutes:
    '''
@ -15,32 +15,16 @@ class InternalRoutes:
    def __init__(self, prompt_server):
        self.routes: web.RouteTableDef = web.RouteTableDef()
        self._app: Optional[web.Application] = None
-        self.file_service = FileService({
-            "models": models_dir,
-            "user": user_directory,
-            "output": output_directory
-        })
        self.prompt_server = prompt_server
        self.terminal_service = TerminalService(prompt_server)

    def setup_routes(self):
-        @self.routes.get('/files')
-        async def list_files(request):
-            directory_key = request.query.get('directory', '')
-            try:
-                file_list = self.file_service.list_files(directory_key)
-                return web.json_response({"files": file_list})
-            except ValueError as e:
-                return web.json_response({"error": str(e)}, status=400)
-            except Exception as e:
-                return web.json_response({"error": str(e)}, status=500)
-
        @self.routes.get('/logs')
        async def get_logs(request):
            return web.json_response("".join([(l["t"] + " - " + l["m"]) for l in app.logger.get_logs()]))

        @self.routes.get('/logs/raw')
-        async def get_logs(request):
+        async def get_raw_logs(request):
            self.terminal_service.update_size()
            return web.json_response({
                "entries": list(app.logger.get_logs()),
@ -67,6 +51,20 @@ class InternalRoutes:
                response[key] = folder_names_and_paths[key][0]
            return web.json_response(response)

+        @self.routes.get('/files/{directory_type}')
+        async def get_files(request: web.Request) -> web.Response:
+            directory_type = request.match_info['directory_type']
+            if directory_type not in ("output", "input", "temp"):
+                return web.json_response({"error": "Invalid directory type"}, status=400)
+
+            directory = get_directory_by_type(directory_type)
+            sorted_files = sorted(
+                (entry for entry in os.scandir(directory) if entry.is_file()),
+                key=lambda entry: -entry.stat().st_mtime
+            )
+            return web.json_response([entry.name for entry in sorted_files], status=200)
+
+
    def get_app(self):
        if self._app is None:
            self._app = web.Application()
--- a/api_server/services/file_service.py
+++ b/api_server/services/file_service.py
@ -1,13 +0,0 @@
-from typing import Dict, List, Optional
-from api_server.utils.file_operations import FileSystemOperations, FileSystemItem
-
-class FileService:
-    def __init__(self, allowed_directories: Dict[str, str], file_system_ops: Optional[FileSystemOperations] = None):
-        self.allowed_directories: Dict[str, str] = allowed_directories
-        self.file_system_ops: FileSystemOperations = file_system_ops or FileSystemOperations()
-
-    def list_files(self, directory_key: str) -> List[FileSystemItem]:
-        if directory_key not in self.allowed_directories:
-            raise ValueError("Invalid directory key")
-        directory_path: str = self.allowed_directories[directory_key]
-        return self.file_system_ops.walk_directory(directory_path)
--- a/app/app_settings.py
+++ b/app/app_settings.py
@ -1,6 +1,7 @@
 import os
 import json
 from aiohttp import web
+import logging


 class AppSettings():
@ -8,11 +9,21 @@ class AppSettings():
        self.user_manager = user_manager

    def get_settings(self, request):
+        try:
            file = self.user_manager.get_request_user_filepath(
-            request, "comfy.settings.json")
+                request,
+                "comfy.settings.json"
+            )
+        except KeyError as e:
+            logging.error("User settings not found.")
+            raise web.HTTPUnauthorized() from e
        if os.path.isfile(file):
+            try:
                with open(file) as f:
                    return json.load(f)
+            except:
+                logging.error(f"The user settings file is corrupted: {file}")
+                return {}
        else:
            return {}

--- a/app/custom_node_manager.py
+++ b/app/custom_node_manager.py
@ -0,0 +1,134 @@
+from __future__ import annotations
+
+import os
+import folder_paths
+import glob
+from aiohttp import web
+import json
+import logging
+from functools import lru_cache
+
+from utils.json_util import merge_json_recursive
+
+
+# Extra locale files to load into main.json
+EXTRA_LOCALE_FILES = [
+    "nodeDefs.json",
+    "commands.json",
+    "settings.json",
+]
+
+
+def safe_load_json_file(file_path: str) -> dict:
+    if not os.path.exists(file_path):
+        return {}
+
+    try:
+        with open(file_path, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except json.JSONDecodeError:
+        logging.error(f"Error loading {file_path}")
+        return {}
+
+
+class CustomNodeManager:
+    @lru_cache(maxsize=1)
+    def build_translations(self):
+        """Load all custom nodes translations during initialization. Translations are
+        expected to be loaded from `locales/` folder.
+
+        The folder structure is expected to be the following:
+        - custom_nodes/
+            - custom_node_1/
+                - locales/
+                    - en/
+                        - main.json
+                        - commands.json
+                        - settings.json
+
+        returned translations are expected to be in the following format:
+        {
+            "en": {
+                "nodeDefs": {...},
+                "commands": {...},
+                "settings": {...},
+                ...{other main.json keys}
+            }
+        }
+        """
+
+        translations = {}
+
+        for folder in folder_paths.get_folder_paths("custom_nodes"):
+            # Sort glob results for deterministic ordering
+            for custom_node_dir in sorted(glob.glob(os.path.join(folder, "*/"))):
+                locales_dir = os.path.join(custom_node_dir, "locales")
+                if not os.path.exists(locales_dir):
+                    continue
+
+                for lang_dir in glob.glob(os.path.join(locales_dir, "*/")):
+                    lang_code = os.path.basename(os.path.dirname(lang_dir))
+
+                    if lang_code not in translations:
+                        translations[lang_code] = {}
+
+                    # Load main.json
+                    main_file = os.path.join(lang_dir, "main.json")
+                    node_translations = safe_load_json_file(main_file)
+
+                    # Load extra locale files
+                    for extra_file in EXTRA_LOCALE_FILES:
+                        extra_file_path = os.path.join(lang_dir, extra_file)
+                        key = extra_file.split(".")[0]
+                        json_data = safe_load_json_file(extra_file_path)
+                        if json_data:
+                            node_translations[key] = json_data
+
+                    if node_translations:
+                        translations[lang_code] = merge_json_recursive(
+                            translations[lang_code], node_translations
+                        )
+
+        return translations
+
+    def add_routes(self, routes, webapp, loadedModules):
+
+        @routes.get("/workflow_templates")
+        async def get_workflow_templates(request):
+            """Returns a web response that contains the map of custom_nodes names and their associated workflow templates. The ones without templates are omitted."""
+            files = [
+                file
+                for folder in folder_paths.get_folder_paths("custom_nodes")
+                for file in glob.glob(
+                    os.path.join(folder, "*/example_workflows/*.json")
+                )
+            ]
+            workflow_templates_dict = (
+                {}
+            )  # custom_nodes folder name -> example workflow names
+            for file in files:
+                custom_nodes_name = os.path.basename(
+                    os.path.dirname(os.path.dirname(file))
+                )
+                workflow_name = os.path.splitext(os.path.basename(file))[0]
+                workflow_templates_dict.setdefault(custom_nodes_name, []).append(
+                    workflow_name
+                )
+            return web.json_response(workflow_templates_dict)
+
+        # Serve workflow templates from custom nodes.
+        for module_name, module_dir in loadedModules:
+            workflows_dir = os.path.join(module_dir, "example_workflows")
+            if os.path.exists(workflows_dir):
+                webapp.add_routes(
+                    [
+                        web.static(
+                            "/api/workflow_templates/" + module_name, workflows_dir
+                        )
+                    ]
+                )
+
+        @routes.get("/i18n")
+        async def get_i18n(request):
+            """Returns translations from all custom nodes' locales folders."""
+            return web.json_response(self.build_translations())
--- a/app/frontend_management.py
+++ b/app/frontend_management.py
@ -3,16 +3,69 @@ import argparse
 import logging
 import os
 import re
+import sys
 import tempfile
 import zipfile
+import importlib
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
 from typing import TypedDict, Optional
+from importlib.metadata import version

 import requests
 from typing_extensions import NotRequired
+
 from comfy.cli_args import DEFAULT_VERSION_STRING
+import app.logger
+
+# The path to the requirements.txt file
+req_path = Path(__file__).parents[1] / "requirements.txt"
+
+
+def frontend_install_warning_message():
+    """The warning message to display when the frontend version is not up to date."""
+
+    extra = ""
+    if sys.flags.no_user_site:
+        extra = "-s "
+    return f"""
+Please install the updated requirements.txt file by running:
+{sys.executable} {extra}-m pip install -r {req_path}
+
+This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.
+
+If you are on the portable package you can run: update\\update_comfyui.bat to solve this problem
+""".strip()
+
+
+def check_frontend_version():
+    """Check if the frontend version is up to date."""
+
+    def parse_version(version: str) -> tuple[int, int, int]:
+        return tuple(map(int, version.split(".")))
+
+    try:
+        frontend_version_str = version("comfyui-frontend-package")
+        frontend_version = parse_version(frontend_version_str)
+        with open(req_path, "r", encoding="utf-8") as f:
+            required_frontend = parse_version(f.readline().split("=")[-1])
+        if frontend_version < required_frontend:
+            app.logger.log_startup_warning(
+                f"""
+________________________________________________________________________
+WARNING WARNING WARNING WARNING WARNING
+
+Installed frontend version {".".join(map(str, frontend_version))} is lower than the recommended version {".".join(map(str, required_frontend))}.
+
+{frontend_install_warning_message()}
+________________________________________________________________________
+""".strip()
+            )
+        else:
+            logging.info("ComfyUI frontend version: {}".format(frontend_version_str))
+    except Exception as e:
+        logging.error(f"Failed to check frontend version: {e}")


 REQUEST_TIMEOUT = 10  # seconds
@ -109,9 +162,28 @@ def download_release_asset_zip(release: Release, destination_path: str) -> None:


 class FrontendManager:
-    DEFAULT_FRONTEND_PATH = str(Path(__file__).parents[1] / "web")
    CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions")

+    @classmethod
+    def default_frontend_path(cls) -> str:
+        try:
+            import comfyui_frontend_package
+
+            return str(importlib.resources.files(comfyui_frontend_package) / "static")
+        except ImportError:
+            logging.error(
+                f"""
+********** ERROR ***********
+
+comfyui-frontend-package is not installed.
+
+{frontend_install_warning_message()}
+
+********** ERROR ***********
+""".strip()
+            )
+            sys.exit(-1)
+
    @classmethod
    def parse_version_string(cls, value: str) -> tuple[str, str, str]:
        """
@ -132,7 +204,9 @@ class FrontendManager:
        return match_result.group(1), match_result.group(2), match_result.group(3)

    @classmethod
-    def init_frontend_unsafe(cls, version_string: str, provider: Optional[FrontEndProvider] = None) -> str:
+    def init_frontend_unsafe(
+        cls, version_string: str, provider: Optional[FrontEndProvider] = None
+    ) -> str:
        """
        Initializes the frontend for the specified version.

@ -148,17 +222,26 @@ class FrontendManager:
            main error source might be request timeout or invalid URL.
        """
        if version_string == DEFAULT_VERSION_STRING:
-            return cls.DEFAULT_FRONTEND_PATH
+            check_frontend_version()
+            return cls.default_frontend_path()

        repo_owner, repo_name, version = cls.parse_version_string(version_string)

        if version.startswith("v"):
-            expected_path = str(Path(cls.CUSTOM_FRONTENDS_ROOT) / f"{repo_owner}_{repo_name}" / version.lstrip("v"))
+            expected_path = str(
+                Path(cls.CUSTOM_FRONTENDS_ROOT)
+                / f"{repo_owner}_{repo_name}"
+                / version.lstrip("v")
+            )
            if os.path.exists(expected_path):
-                logging.info(f"Using existing copy of specific frontend version tag: {repo_owner}/{repo_name}@{version}")
+                logging.info(
+                    f"Using existing copy of specific frontend version tag: {repo_owner}/{repo_name}@{version}"
+                )
                return expected_path

-        logging.info(f"Initializing frontend: {repo_owner}/{repo_name}@{version}, requesting version details from GitHub...")
+        logging.info(
+            f"Initializing frontend: {repo_owner}/{repo_name}@{version}, requesting version details from GitHub..."
+        )

        provider = provider or FrontEndProvider(repo_owner, repo_name)
        release = provider.get_release(version)
@ -201,4 +284,5 @@ class FrontendManager:
        except Exception as e:
            logging.error("Failed to initialize frontend: %s", e)
            logging.info("Falling back to the default frontend.")
-            return cls.DEFAULT_FRONTEND_PATH
+            check_frontend_version()
+            return cls.default_frontend_path()
--- a/app/logger.py
+++ b/app/logger.py
@ -51,7 +51,7 @@ def on_flush(callback):
    if stderr_interceptor is not None:
        stderr_interceptor.on_flush(callback)

-def setup_logger(log_level: str = 'INFO', capacity: int = 300):
+def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool = False):
    global logs
    if logs:
        return
@ -70,4 +70,29 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300):

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter("%(message)s"))
+
+    if use_stdout:
+        # Only errors and critical to stderr
+        stream_handler.addFilter(lambda record: not record.levelno < logging.ERROR)
+
+        # Lesser to stdout
+        stdout_handler = logging.StreamHandler(sys.stdout)
+        stdout_handler.setFormatter(logging.Formatter("%(message)s"))
+        stdout_handler.addFilter(lambda record: record.levelno < logging.ERROR)
+        logger.addHandler(stdout_handler)
+
    logger.addHandler(stream_handler)
+
+
+STARTUP_WARNINGS = []
+
+
+def log_startup_warning(msg):
+    logging.warning(msg)
+    STARTUP_WARNINGS.append(msg)
+
+
+def print_startup_warnings():
+    for s in STARTUP_WARNINGS:
+        logging.warning(s)
+    STARTUP_WARNINGS.clear()
--- a/app/model_manager.py
+++ b/app/model_manager.py
@ -0,0 +1,184 @@
+from __future__ import annotations
+
+import os
+import base64
+import json
+import time
+import logging
+import folder_paths
+import glob
+import comfy.utils
+from aiohttp import web
+from PIL import Image
+from io import BytesIO
+from folder_paths import map_legacy, filter_files_extensions, filter_files_content_types
+
+
+class ModelFileManager:
+    def __init__(self) -> None:
+        self.cache: dict[str, tuple[list[dict], dict[str, float], float]] = {}
+
+    def get_cache(self, key: str, default=None) -> tuple[list[dict], dict[str, float], float] | None:
+        return self.cache.get(key, default)
+
+    def set_cache(self, key: str, value: tuple[list[dict], dict[str, float], float]):
+        self.cache[key] = value
+
+    def clear_cache(self):
+        self.cache.clear()
+
+    def add_routes(self, routes):
+        # NOTE: This is an experiment to replace `/models`
+        @routes.get("/experiment/models")
+        async def get_model_folders(request):
+            model_types = list(folder_paths.folder_names_and_paths.keys())
+            folder_black_list = ["configs", "custom_nodes"]
+            output_folders: list[dict] = []
+            for folder in model_types:
+                if folder in folder_black_list:
+                    continue
+                output_folders.append({"name": folder, "folders": folder_paths.get_folder_paths(folder)})
+            return web.json_response(output_folders)
+
+        # NOTE: This is an experiment to replace `/models/{folder}`
+        @routes.get("/experiment/models/{folder}")
+        async def get_all_models(request):
+            folder = request.match_info.get("folder", None)
+            if not folder in folder_paths.folder_names_and_paths:
+                return web.Response(status=404)
+            files = self.get_model_file_list(folder)
+            return web.json_response(files)
+
+        @routes.get("/experiment/models/preview/{folder}/{path_index}/{filename:.*}")
+        async def get_model_preview(request):
+            folder_name = request.match_info.get("folder", None)
+            path_index = int(request.match_info.get("path_index", None))
+            filename = request.match_info.get("filename", None)
+
+            if not folder_name in folder_paths.folder_names_and_paths:
+                return web.Response(status=404)
+
+            folders = folder_paths.folder_names_and_paths[folder_name]
+            folder = folders[0][path_index]
+            full_filename = os.path.join(folder, filename)
+
+            previews = self.get_model_previews(full_filename)
+            default_preview = previews[0] if len(previews) > 0 else None
+            if default_preview is None or (isinstance(default_preview, str) and not os.path.isfile(default_preview)):
+                return web.Response(status=404)
+
+            try:
+                with Image.open(default_preview) as img:
+                    img_bytes = BytesIO()
+                    img.save(img_bytes, format="WEBP")
+                    img_bytes.seek(0)
+                    return web.Response(body=img_bytes.getvalue(), content_type="image/webp")
+            except:
+                return web.Response(status=404)
+
+    def get_model_file_list(self, folder_name: str):
+        folder_name = map_legacy(folder_name)
+        folders = folder_paths.folder_names_and_paths[folder_name]
+        output_list: list[dict] = []
+
+        for index, folder in enumerate(folders[0]):
+            if not os.path.isdir(folder):
+                continue
+            out = self.cache_model_file_list_(folder)
+            if out is None:
+                out = self.recursive_search_models_(folder, index)
+                self.set_cache(folder, out)
+            output_list.extend(out[0])
+
+        return output_list
+
+    def cache_model_file_list_(self, folder: str):
+        model_file_list_cache = self.get_cache(folder)
+
+        if model_file_list_cache is None:
+            return None
+        if not os.path.isdir(folder):
+            return None
+        if os.path.getmtime(folder) != model_file_list_cache[1]:
+            return None
+        for x in model_file_list_cache[1]:
+            time_modified = model_file_list_cache[1][x]
+            folder = x
+            if os.path.getmtime(folder) != time_modified:
+                return None
+
+        return model_file_list_cache
+
+    def recursive_search_models_(self, directory: str, pathIndex: int) -> tuple[list[str], dict[str, float], float]:
+        if not os.path.isdir(directory):
+            return [], {}, time.perf_counter()
+
+        excluded_dir_names = [".git"]
+        # TODO use settings
+        include_hidden_files = False
+
+        result: list[str] = []
+        dirs: dict[str, float] = {}
+
+        for dirpath, subdirs, filenames in os.walk(directory, followlinks=True, topdown=True):
+            subdirs[:] = [d for d in subdirs if d not in excluded_dir_names]
+            if not include_hidden_files:
+                subdirs[:] = [d for d in subdirs if not d.startswith(".")]
+                filenames = [f for f in filenames if not f.startswith(".")]
+
+            filenames = filter_files_extensions(filenames, folder_paths.supported_pt_extensions)
+
+            for file_name in filenames:
+                try:
+                    relative_path = os.path.relpath(os.path.join(dirpath, file_name), directory)
+                    result.append(relative_path)
+                except:
+                    logging.warning(f"Warning: Unable to access {file_name}. Skipping this file.")
+                    continue
+
+            for d in subdirs:
+                path: str = os.path.join(dirpath, d)
+                try:
+                    dirs[path] = os.path.getmtime(path)
+                except FileNotFoundError:
+                    logging.warning(f"Warning: Unable to access {path}. Skipping this path.")
+                    continue
+
+        return [{"name": f, "pathIndex": pathIndex} for f in result], dirs, time.perf_counter()
+
+    def get_model_previews(self, filepath: str) -> list[str | BytesIO]:
+        dirname = os.path.dirname(filepath)
+
+        if not os.path.exists(dirname):
+            return []
+
+        basename = os.path.splitext(filepath)[0]
+        match_files = glob.glob(f"{basename}.*", recursive=False)
+        image_files = filter_files_content_types(match_files, "image")
+        safetensors_file = next(filter(lambda x: x.endswith(".safetensors"), match_files), None)
+        safetensors_metadata = {}
+
+        result: list[str | BytesIO] = []
+
+        for filename in image_files:
+            _basename = os.path.splitext(filename)[0]
+            if _basename == basename:
+                result.append(filename)
+            if _basename == f"{basename}.preview":
+                result.append(filename)
+
+        if safetensors_file:
+            safetensors_filepath = os.path.join(dirname, safetensors_file)
+            header = comfy.utils.safetensors_header(safetensors_filepath, max_size=8*1024*1024)
+            if header:
+                safetensors_metadata = json.loads(header)
+        safetensors_images = safetensors_metadata.get("__metadata__", {}).get("ssmd_cover_images", None)
+        if safetensors_images:
+            safetensors_images = json.loads(safetensors_images)
+            for image in safetensors_images:
+                result.append(BytesIO(base64.b64decode(image)))
+
+        return result
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.clear_cache()
--- a/app/user_manager.py
+++ b/app/user_manager.py
@ -38,8 +38,8 @@ class UserManager():
        if not os.path.exists(user_directory):
            os.makedirs(user_directory, exist_ok=True)
            if not args.multi_user:
-                print("****** User settings have been changed to be stored on the server instead of browser storage. ******")
-                print("****** For multi-user setups add the --multi-user CLI argument to enable multiple user profiles. ******")
+                logging.warning("****** User settings have been changed to be stored on the server instead of browser storage. ******")
+                logging.warning("****** For multi-user setups add the --multi-user CLI argument to enable multiple user profiles. ******")

        if args.multi_user:
            if os.path.isfile(self.get_users_file()):
--- a/comfy/cldm/cldm.py
+++ b/comfy/cldm/cldm.py
@ -2,11 +2,9 @@
 #and modified

 import torch
-import torch as th
 import torch.nn as nn

 from ..ldm.modules.diffusionmodules.util import (
-    zero_module,
    timestep_embedding,
 )

@ -162,7 +160,6 @@ class ControlNet(nn.Module):
            if isinstance(self.num_classes, int):
                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
            elif self.num_classes == "continuous":
-                print("setting up linear c_adm embedding layer")
                self.label_emb = nn.Linear(1, time_embed_dim)
            elif self.num_classes == "sequential":
                assert adm_in_channels is not None
@ -415,7 +412,6 @@ class ControlNet(nn.Module):
        out_output = []
        out_middle = []

-        hs = []
        if self.num_classes is not None:
            assert y.shape[0] == x.shape[0]
            emb = emb + self.label_emb(y)
--- a/comfy/cldm/dit_embedder.py
+++ b/comfy/cldm/dit_embedder.py
@ -1,10 +1,8 @@
 import math
 from typing import List, Optional, Tuple

-import numpy as np
 import torch
 import torch.nn as nn
-from einops import rearrange
 from torch import Tensor

 from comfy.ldm.modules.diffusionmodules.mmdit import DismantledBlock, PatchEmbed, VectorEmbedder, TimestepEmbedder, get_2d_sincos_pos_embed_torch
--- a/comfy/cldm/mmdit.py
+++ b/comfy/cldm/mmdit.py
@ -1,5 +1,5 @@
 import torch
-from typing import Dict, Optional
+from typing import Optional
 import comfy.ldm.modules.diffusionmodules.mmdit

 class ControlNet(comfy.ldm.modules.diffusionmodules.mmdit.MMDiT):
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -1,7 +1,6 @@
 import argparse
 import enum
 import os
-from typing import Optional
 import comfy.options


@ -43,10 +42,11 @@ parser.add_argument("--tls-certfile", type=str, help="Path to TLS (SSL) certific
 parser.add_argument("--enable-cors-header", type=str, default=None, metavar="ORIGIN", nargs="?", const="*", help="Enable CORS (Cross-Origin Resource Sharing) with optional origin or allow all with default '*'.")
 parser.add_argument("--max-upload-size", type=float, default=100, help="Set the maximum upload size in MB.")

+parser.add_argument("--base-directory", type=str, default=None, help="Set the ComfyUI base directory for models, custom_nodes, input, output, temp, and user directories.")
 parser.add_argument("--extra-model-paths-config", type=str, default=None, metavar="PATH", nargs='+', action='append', help="Load one or more extra_model_paths.yaml files.")
-parser.add_argument("--output-directory", type=str, default=None, help="Set the ComfyUI output directory.")
-parser.add_argument("--temp-directory", type=str, default=None, help="Set the ComfyUI temp directory (default is in the ComfyUI directory).")
-parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory.")
+parser.add_argument("--output-directory", type=str, default=None, help="Set the ComfyUI output directory. Overrides --base-directory.")
+parser.add_argument("--temp-directory", type=str, default=None, help="Set the ComfyUI temp directory (default is in the ComfyUI directory). Overrides --base-directory.")
+parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.")
 parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
 parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.")
 parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use.")
@ -79,12 +79,14 @@ fpte_group.add_argument("--fp8_e4m3fn-text-enc", action="store_true", help="Stor
 fpte_group.add_argument("--fp8_e5m2-text-enc", action="store_true", help="Store text encoder weights in fp8 (e5m2 variant).")
 fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text encoder weights in fp16.")
 fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.")
+fpte_group.add_argument("--bf16-text-enc", action="store_true", help="Store text encoder weights in bf16.")

 parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")

 parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")

-parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize when loading models with Intel GPUs.")
+parser.add_argument("--oneapi-device-selector", type=str, default=None, metavar="SELECTOR_STRING", help="Sets the oneAPI device(s) this instance will use.")
+parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize default when loading models with Intel's Extension for Pytorch.")

 class LatentPreviewMethod(enum.Enum):
    NoPreviews = "none"
@ -99,11 +101,14 @@ parser.add_argument("--preview-size", type=int, default=512, help="Sets the maxi
 cache_group = parser.add_mutually_exclusive_group()
 cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
 cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
+cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")

 attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
 attn_group.add_argument("--use-quad-cross-attention", action="store_true", help="Use the sub-quadratic cross attention optimization . Ignored when xformers is used.")
 attn_group.add_argument("--use-pytorch-cross-attention", action="store_true", help="Use the new pytorch 2.0 cross attention function.")
+attn_group.add_argument("--use-sage-attention", action="store_true", help="Use sage attention.")
+attn_group.add_argument("--use-flash-attention", action="store_true", help="Use FlashAttention.")

 parser.add_argument("--disable-xformers", action="store_true", help="Disable xformers.")

@ -120,14 +125,20 @@ vram_group.add_argument("--lowvram", action="store_true", help="Split the unet i
 vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.")
 vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")

-parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reverved depending on your OS.")
+parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.")


 parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.")

 parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
 parser.add_argument("--deterministic", action="store_true", help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")
-parser.add_argument("--fast", action="store_true", help="Enable some untested and potentially quality deteriorating optimizations.")
+
+class PerformanceFeature(enum.Enum):
+    Fp16Accumulation = "fp16_accumulation"
+    Fp8MatrixMultiplication = "fp8_matrix_mult"
+    CublasOps = "cublas_ops"
+
+parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops")

 parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
 parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.")
@ -139,6 +150,7 @@ parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Dis
 parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")

 parser.add_argument("--verbose", default='INFO', const='DEBUG', nargs="?", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Set the logging level')
+parser.add_argument("--log-stdout", action="store_true", help="Send normal process output to stdout instead of stderr (default).")

 # The default built-in provider hosted under web/
 DEFAULT_VERSION_STRING = "comfyanonymous/ComfyUI@latest"
@ -157,13 +169,14 @@ parser.add_argument(
    """,
 )

-def is_valid_directory(path: Optional[str]) -> Optional[str]:
-    """Validate if the given path is a directory."""
-    if path is None:
-        return None
-
+def is_valid_directory(path: str) -> str:
+    """Validate if the given path is a directory, and check permissions."""
+    if not os.path.exists(path):
+        raise argparse.ArgumentTypeError(f"The path '{path}' does not exist.")
    if not os.path.isdir(path):
-        raise argparse.ArgumentTypeError(f"{path} is not a valid directory.")
+        raise argparse.ArgumentTypeError(f"'{path}' is not a directory.")
+    if not os.access(path, os.R_OK):
+        raise argparse.ArgumentTypeError(f"You do not have read permissions for '{path}'.")
    return path

 parser.add_argument(
@ -173,7 +186,9 @@ parser.add_argument(
    help="The local filesystem path to the directory where the frontend is located. Overrides --front-end-version.",
 )

-parser.add_argument("--user-directory", type=is_valid_directory, default=None, help="Set the ComfyUI user directory with an absolute path.")
+parser.add_argument("--user-directory", type=is_valid_directory, default=None, help="Set the ComfyUI user directory with an absolute path. Overrides --base-directory.")
+
+parser.add_argument("--enable-compress-response-body", action="store_true", help="Enable compressing response body.")

 if comfy.options.args_parsing:
    args = parser.parse_args()
@ -185,3 +200,17 @@ if args.windows_standalone_build:

 if args.disable_auto_launch:
    args.auto_launch = False
+
+if args.force_fp16:
+    args.fp16_unet = True
+
+
+# '--fast' is not provided, use an empty set
+if args.fast is None:
+    args.fast = set()
+# '--fast' is provided with an empty list, enable all optimizations
+elif args.fast == []:
+    args.fast = set(PerformanceFeature)
+# '--fast' is provided with a list of performance features, use that list
+else:
+    args.fast = set(args.fast)
--- a/comfy/clip_model.py
+++ b/comfy/clip_model.py
@ -97,14 +97,19 @@ class CLIPTextModel_(torch.nn.Module):
        self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
        self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)

-    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
+    def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
+        if embeds is not None:
+            x = embeds + comfy.ops.cast_to(self.embeddings.position_embedding.weight, dtype=dtype, device=embeds.device)
+        else:
            x = self.embeddings(input_tokens, dtype=dtype)
+
        mask = None
        if attention_mask is not None:
            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
-            mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))
+            mask = mask.masked_fill(mask.to(torch.bool), -torch.finfo(x.dtype).max)
+
+        causal_mask = torch.full((x.shape[1], x.shape[1]), -torch.finfo(x.dtype).max, dtype=x.dtype, device=x.device).triu_(1)

-        causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1)
        if mask is not None:
            mask += causal_mask
        else:
@ -115,6 +120,9 @@ class CLIPTextModel_(torch.nn.Module):
        if i is not None and final_layer_norm_intermediate:
            i = self.final_layer_norm(i)

+        if num_tokens is not None:
+            pooled_output = x[list(range(x.shape[0])), list(map(lambda a: a - 1, num_tokens))]
+        else:
            pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
        return x, i, pooled_output

@ -203,6 +211,15 @@ class CLIPVision(torch.nn.Module):
            pooled_output = self.post_layernorm(x[:, 0, :])
        return x, i, pooled_output

+class LlavaProjector(torch.nn.Module):
+    def __init__(self, in_dim, out_dim, dtype, device, operations):
+        super().__init__()
+        self.linear_1 = operations.Linear(in_dim, out_dim, bias=True, device=device, dtype=dtype)
+        self.linear_2 = operations.Linear(out_dim, out_dim, bias=True, device=device, dtype=dtype)
+
+    def forward(self, x):
+        return self.linear_2(torch.nn.functional.gelu(self.linear_1(x[:, 1:])))
+
 class CLIPVisionModelProjection(torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
@ -212,7 +229,16 @@ class CLIPVisionModelProjection(torch.nn.Module):
        else:
            self.visual_projection = lambda a: a

+        if "llava3" == config_dict.get("projector_type", None):
+            self.multi_modal_projector = LlavaProjector(config_dict["hidden_size"], 4096, dtype, device, operations)
+        else:
+            self.multi_modal_projector = None
+
    def forward(self, *args, **kwargs):
        x = self.vision_model(*args, **kwargs)
        out = self.visual_projection(x[2])
-        return (x[0], x[1], out)
+        projected = None
+        if self.multi_modal_projector is not None:
+            projected = self.multi_modal_projector(x[1])
+
+        return (x[0], x[1], out, projected)
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@ -9,6 +9,7 @@ import comfy.model_patcher
 import comfy.model_management
 import comfy.utils
 import comfy.clip_model
+import comfy.image_encoders.dino2

 class Output:
    def __getitem__(self, key):
@ -34,6 +35,12 @@ def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], s
    image = torch.clip((255. * image), 0, 255).round() / 255.0
    return (image - mean.view([3,1,1])) / std.view([3,1,1])

+IMAGE_ENCODERS = {
+    "clip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
+    "siglip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
+    "dinov2": comfy.image_encoders.dino2.Dinov2Model,
+}
+
 class ClipVisionModel():
    def __init__(self, json_config):
        with open(json_config) as f:
@ -42,10 +49,11 @@ class ClipVisionModel():
        self.image_size = config.get("image_size", 224)
        self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073])
        self.image_std = config.get("image_std", [0.26862954, 0.26130258, 0.27577711])
+        model_class = IMAGE_ENCODERS.get(config.get("model_type", "clip_vision_model"))
        self.load_device = comfy.model_management.text_encoder_device()
        offload_device = comfy.model_management.text_encoder_offload_device()
        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
-        self.model = comfy.clip_model.CLIPVisionModelProjection(config, self.dtype, offload_device, comfy.ops.manual_cast)
+        self.model = model_class(config, self.dtype, offload_device, comfy.ops.manual_cast)
        self.model.eval()

        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
@ -65,6 +73,7 @@ class ClipVisionModel():
        outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device())
        outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device())
        outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device())
+        outputs["mm_projected"] = out[3]
        return outputs

 def convert_to_transformers(sd, prefix):
@ -101,12 +110,21 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
    elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
        json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
+        embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
+            if embed_shape == 729:
                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
-        elif sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577:
+            elif embed_shape == 1024:
+                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_512.json")
+        elif embed_shape == 577:
+            if "multi_modal_projector.linear_1.bias" in sd:
+                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
+            else:
                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
        else:
            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
+    elif "embeddings.patch_embeddings.projection.weight" in sd:
+        json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_giant.json")
    else:
        return None

--- a/comfy/clip_vision_config_vitl_336_llava.json
+++ b/comfy/clip_vision_config_vitl_336_llava.json
@ -0,0 +1,19 @@
+{
+  "attention_dropout": 0.0,
+  "dropout": 0.0,
+  "hidden_act": "quick_gelu",
+  "hidden_size": 1024,
+  "image_size": 336,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-5,
+  "model_type": "clip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 24,
+  "patch_size": 14,
+  "projection_dim": 768,
+  "projector_type": "llava3",
+  "torch_dtype": "float32"
+}
--- a/comfy/clip_vision_siglip_512.json
+++ b/comfy/clip_vision_siglip_512.json
@ -0,0 +1,13 @@
+{
+  "num_channels": 3,
+  "hidden_act": "gelu_pytorch_tanh",
+  "hidden_size": 1152,
+  "image_size": 512,
+  "intermediate_size": 4304,
+  "model_type": "siglip_vision_model",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 27,
+  "patch_size": 16,
+  "image_mean": [0.5, 0.5, 0.5],
+  "image_std": [0.5, 0.5, 0.5]
+}
--- a/comfy/comfy_types/README.md
+++ b/comfy/comfy_types/README.md
@ -5,7 +5,7 @@ This module provides type hinting and concrete convenience types for node develo
 If cloned to the custom_nodes directory of ComfyUI, types can be imported using:

 ```python
-from comfy_types import IO, ComfyNodeABC, CheckLazyMixin
+from comfy.comfy_types import IO, ComfyNodeABC, CheckLazyMixin

 class ExampleNode(ComfyNodeABC):
    @classmethod
--- a/comfy/comfy_types/init.py
+++ b/comfy/comfy_types/init.py
@ -1,6 +1,6 @@
 import torch
 from typing import Callable, Protocol, TypedDict, Optional, List
-from .node_typing import IO, InputTypeDict, ComfyNodeABC, CheckLazyMixin
+from .node_typing import IO, InputTypeDict, ComfyNodeABC, CheckLazyMixin, FileLocator


 class UnetApplyFunction(Protocol):
@ -42,4 +42,5 @@ __all__ = [
    InputTypeDict.__name__,
    ComfyNodeABC.__name__,
    CheckLazyMixin.__name__,
+    FileLocator.__name__,
 ]
--- a/comfy/comfy_types/examples/example_nodes.py
+++ b/comfy/comfy_types/examples/example_nodes.py
@ -1,12 +1,12 @@
-from comfy_types import IO, ComfyNodeABC, InputTypeDict
+from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict
 from inspect import cleandoc


 class ExampleNode(ComfyNodeABC):
    """An example node that just adds 1 to an input integer.

-    * Requires an IDE configured with analysis paths etc to be worth looking at.
-    * Not intended for use in ComfyUI.
+    * Requires a modern IDE to provide any benefit (detail: an IDE configured with analysis paths etc).
+    * This node is intended as an example for developers only.
    """

    DESCRIPTION = cleandoc(__doc__)
--- a/comfy/comfy_types/node_typing.py
+++ b/comfy/comfy_types/node_typing.py
@ -2,6 +2,7 @@

 from __future__ import annotations
 from typing import Literal, TypedDict
+from typing_extensions import NotRequired
 from abc import ABC, abstractmethod
 from enum import Enum

@ -26,6 +27,7 @@ class IO(StrEnum):
    BOOLEAN = "BOOLEAN"
    INT = "INT"
    FLOAT = "FLOAT"
+    COMBO = "COMBO"
    CONDITIONING = "CONDITIONING"
    SAMPLER = "SAMPLER"
    SIGMAS = "SIGMAS"
@ -67,20 +69,46 @@ class IO(StrEnum):
        return not (b.issubset(a) or a.issubset(b))


+class RemoteInputOptions(TypedDict):
+    route: str
+    """The route to the remote source."""
+    refresh_button: bool
+    """Specifies whether to show a refresh button in the UI below the widget."""
+    control_after_refresh: Literal["first", "last"]
+    """Specifies the control after the refresh button is clicked. If "first", the first item will be automatically selected, and so on."""
+    timeout: int
+    """The maximum amount of time to wait for a response from the remote source in milliseconds."""
+    max_retries: int
+    """The maximum number of retries before aborting the request."""
+    refresh: int
+    """The TTL of the remote input's value in milliseconds. Specifies the interval at which the remote input's value is refreshed."""
+
+
+class MultiSelectOptions(TypedDict):
+    placeholder: NotRequired[str]
+    """The placeholder text to display in the multi-select widget when no items are selected."""
+    chip: NotRequired[bool]
+    """Specifies whether to use chips instead of comma separated values for the multi-select widget."""
+
+
 class InputTypeOptions(TypedDict):
    """Provides type hinting for the return type of the INPUT_TYPES node function.

    Due to IDE limitations with unions, for now all options are available for all types (e.g. `label_on` is hinted even when the type is not `IO.BOOLEAN`).

-    Comfy Docs: https://docs.comfy.org/essentials/custom_node_datatypes
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/datatypes
    """

    default: bool | str | float | int | list | tuple
    """The default value of the widget"""
    defaultInput: bool
-    """Defaults to an input slot rather than a widget"""
+    """@deprecated in v1.16 frontend. v1.16 frontend allows input socket and widget to co-exist.
+    - defaultInput on required inputs should be dropped.
+    - defaultInput on optional inputs should be replaced with forceInput.
+    Ref: https://github.com/Comfy-Org/ComfyUI_frontend/pull/3364
+    """
    forceInput: bool
-    """`defaultInput` and also don't allow converting to a widget"""
+    """Forces the input to be an input slot rather than a widget even a widget is available for the input type."""
    lazy: bool
    """Declares that this input uses lazy evaluation"""
    rawLink: bool
@ -101,7 +129,7 @@ class InputTypeOptions(TypedDict):
    # default: bool
    label_on: str
    """The label to use in the UI when the bool is True (``BOOLEAN``)"""
-    label_on: str
+    label_off: str
    """The label to use in the UI when the bool is False (``BOOLEAN``)"""
    # class InputTypeString(InputTypeOptions):
    # default: str
@ -113,6 +141,29 @@ class InputTypeOptions(TypedDict):
    # defaultVal: str
    dynamicPrompts: bool
    """Causes the front-end to evaluate dynamic prompts (``STRING``)"""
+    # class InputTypeCombo(InputTypeOptions):
+    image_upload: bool
+    """Specifies whether the input should have an image upload button and image preview attached to it. Requires that the input's name is `image`."""
+    image_folder: Literal["input", "output", "temp"]
+    """Specifies which folder to get preview images from if the input has the ``image_upload`` flag.
+    """
+    remote: RemoteInputOptions
+    """Specifies the configuration for a remote input.
+    Available after ComfyUI frontend v1.9.7
+    https://github.com/Comfy-Org/ComfyUI_frontend/pull/2422"""
+    control_after_generate: bool
+    """Specifies whether a control widget should be added to the input, adding options to automatically change the value after each prompt is queued. Currently only used for INT and COMBO types."""
+    options: NotRequired[list[str | int | float]]
+    """COMBO type only. Specifies the selectable options for the combo widget.
+    Prefer:
+    ["COMBO", {"options": ["Option 1", "Option 2", "Option 3"]}]
+    Over:
+    [["Option 1", "Option 2", "Option 3"]]
+    """
+    multi_select: NotRequired[MultiSelectOptions]
+    """COMBO type only. Specifies the configuration for a multi-select widget.
+    Available after ComfyUI frontend v1.13.4
+    https://github.com/Comfy-Org/ComfyUI_frontend/pull/2987"""


 class HiddenInputTypeDict(TypedDict):
@ -133,7 +184,7 @@ class HiddenInputTypeDict(TypedDict):
 class InputTypeDict(TypedDict):
    """Provides type hinting for node INPUT_TYPES.

-    Comfy Docs: https://docs.comfy.org/essentials/custom_node_more_on_inputs
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs
    """

    required: dict[str, tuple[IO, InputTypeOptions]]
@ -143,14 +194,14 @@ class InputTypeDict(TypedDict):
    hidden: HiddenInputTypeDict
    """Offers advanced functionality and server-client communication.

-    Comfy Docs: https://docs.comfy.org/essentials/custom_node_more_on_inputs#hidden-inputs
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs#hidden-inputs
    """


 class ComfyNodeABC(ABC):
    """Abstract base class for Comfy nodes.  Includes the names and expected types of attributes.

-    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview
    """

    DESCRIPTION: str
@ -167,7 +218,7 @@ class ComfyNodeABC(ABC):
    CATEGORY: str
    """The category of the node, as per the "Add Node" menu.

-    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#category
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#category
    """
    EXPERIMENTAL: bool
    """Flags a node as experimental, informing users that it may change or not work as expected."""
@ -181,9 +232,9 @@ class ComfyNodeABC(ABC):

        * Must include the ``required`` key, which describes all inputs that must be connected for the node to execute.
        * The ``optional`` key can be added to describe inputs which do not need to be connected.
-        * The ``hidden`` key offers some advanced functionality.  More info at: https://docs.comfy.org/essentials/custom_node_more_on_inputs#hidden-inputs
+        * The ``hidden`` key offers some advanced functionality.  More info at: https://docs.comfy.org/custom-nodes/backend/more_on_inputs#hidden-inputs

-        Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#input-types
+        Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#input-types
        """
        return {"required": {}}

@ -198,7 +249,7 @@ class ComfyNodeABC(ABC):

    By default, a node is not considered an output. Set ``OUTPUT_NODE = True`` to specify that it is.

-    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#output-node
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#output-node
    """
    INPUT_IS_LIST: bool
    """A flag indicating if this node implements the additional code necessary to deal with OUTPUT_IS_LIST nodes.
@ -209,7 +260,7 @@ class ComfyNodeABC(ABC):

    A node can also override the default input behaviour and receive the whole list in a single call. This is done by setting a class attribute `INPUT_IS_LIST` to ``True``.

-    Comfy Docs: https://docs.comfy.org/essentials/custom_node_lists#list-processing
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing
    """
    OUTPUT_IS_LIST: tuple[bool]
    """A tuple indicating which node outputs are lists, but will be connected to nodes that expect individual items.
@ -227,7 +278,7 @@ class ComfyNodeABC(ABC):
    the node should provide a class attribute `OUTPUT_IS_LIST`, which is a ``tuple[bool]``, of the same length as `RETURN_TYPES`,
    specifying which outputs which should be so treated.

-    Comfy Docs: https://docs.comfy.org/essentials/custom_node_lists#list-processing
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing
    """

    RETURN_TYPES: tuple[IO]
@ -237,19 +288,19 @@ class ComfyNodeABC(ABC):

        RETURN_TYPES = (IO.INT, "INT", "CUSTOM_TYPE")

-    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#return-types
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-types
    """
    RETURN_NAMES: tuple[str]
    """The output slot names for each item in `RETURN_TYPES`, e.g. ``RETURN_NAMES = ("count", "filter_string")``

-    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#return-names
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-names
    """
    OUTPUT_TOOLTIPS: tuple[str]
    """A tuple of strings to use as tooltips for node outputs, one for each item in `RETURN_TYPES`."""
    FUNCTION: str
    """The name of the function to execute as a literal string, e.g. `FUNCTION = "execute"`

-    Comfy Docs: https://docs.comfy.org/essentials/custom_node_server_overview#function
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#function
    """


@ -267,8 +318,19 @@ class CheckLazyMixin:
        Params should match the nodes execution ``FUNCTION`` (self, and all inputs by name).
        Will be executed repeatedly until it returns an empty list, or all requested items were already evaluated (and sent as params).

-        Comfy Docs: https://docs.comfy.org/essentials/custom_node_lazy_evaluation#defining-check-lazy-status
+        Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lazy_evaluation#defining-check-lazy-status
        """

        need = [name for name in kwargs if kwargs[name] is None]
        return need
+
+
+class FileLocator(TypedDict):
+    """Provides type hinting for the file location"""
+
+    filename: str
+    """The filename of the file."""
+    subfolder: str
+    """The subfolder of the file."""
+    type: Literal["input", "output", "temp"]
+    """The root folder of the file."""
--- a/comfy/conds.py
+++ b/comfy/conds.py
@ -3,9 +3,6 @@ import math
 import comfy.utils


-def lcm(a, b): #TODO: eventually replace by math.lcm (added in python3.9)
-    return abs(a*b) // math.gcd(a, b)
-
 class CONDRegular:
    def __init__(self, cond):
        self.cond = cond
@ -46,7 +43,7 @@ class CONDCrossAttn(CONDRegular):
            if s1[0] != s2[0] or s1[2] != s2[2]: #these 2 cases should not happen
                return False

-            mult_min = lcm(s1[1], s2[1])
+            mult_min = math.lcm(s1[1], s2[1])
            diff = mult_min // min(s1[1], s2[1])
            if diff > 4: #arbitrary limit on the padding because it's probably going to impact performance negatively if it's too much
                return False
@ -57,7 +54,7 @@ class CONDCrossAttn(CONDRegular):
        crossattn_max_len = self.cond.shape[1]
        for x in others:
            c = x.cond
-            crossattn_max_len = lcm(crossattn_max_len, c.shape[1])
+            crossattn_max_len = math.lcm(crossattn_max_len, c.shape[1])
            conds.append(c)

        out = []
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@ -297,7 +297,6 @@ class ControlLoraOps:
    class Linear(torch.nn.Module, comfy.ops.CastWeightBiasOp):
        def __init__(self, in_features: int, out_features: int, bias: bool = True,
                    device=None, dtype=None) -> None:
-            factory_kwargs = {'device': device, 'dtype': dtype}
            super().__init__()
            self.in_features = in_features
            self.out_features = out_features
@ -382,7 +381,6 @@ class ControlLora(ControlNet):
        self.control_model.to(comfy.model_management.get_torch_device())
        diffusion_model = model.diffusion_model
        sd = diffusion_model.state_dict()
-        cm = self.control_model.state_dict()

        for k in sd:
            weight = sd[k]
@ -420,10 +418,7 @@ def controlnet_config(sd, model_options={}):
        weight_dtype = comfy.utils.weight_dtype(sd)

        supported_inference_dtypes = list(model_config.supported_inference_dtypes)
-        if weight_dtype is not None:
-            supported_inference_dtypes.append(weight_dtype)
-
-        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes)
+        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes, weight_dtype=weight_dtype)

    load_device = comfy.model_management.get_torch_device()
    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
@ -691,10 +686,7 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}):
        if supported_inference_dtypes is None:
            supported_inference_dtypes = [comfy.model_management.unet_dtype()]

-        if weight_dtype is not None:
-            supported_inference_dtypes.append(weight_dtype)
-
-        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes)
+        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes, weight_dtype=weight_dtype)

    load_device = comfy.model_management.get_torch_device()

@ -823,7 +815,7 @@ def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options
        for i in range(4):
            for j in range(2):
                prefix_replace["adapter.body.{}.resnets.{}.".format(i, j)] = "body.{}.".format(i * 2 + j)
-            prefix_replace["adapter.body.{}.".format(i, j)] = "body.{}.".format(i * 2)
+            prefix_replace["adapter.body.{}.".format(i, )] = "body.{}.".format(i * 2)
        prefix_replace["adapter."] = ""
        t2i_data = comfy.utils.state_dict_prefix_replace(t2i_data, prefix_replace)
    keys = t2i_data.keys()
--- a/comfy/diffusers_convert.py
+++ b/comfy/diffusers_convert.py
@ -4,105 +4,6 @@ import logging

 # conversion code from https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py

-# =================#
-# UNet Conversion #
-# =================#
-
-unet_conversion_map = [
-    # (stable-diffusion, HF Diffusers)
-    ("time_embed.0.weight", "time_embedding.linear_1.weight"),
-    ("time_embed.0.bias", "time_embedding.linear_1.bias"),
-    ("time_embed.2.weight", "time_embedding.linear_2.weight"),
-    ("time_embed.2.bias", "time_embedding.linear_2.bias"),
-    ("input_blocks.0.0.weight", "conv_in.weight"),
-    ("input_blocks.0.0.bias", "conv_in.bias"),
-    ("out.0.weight", "conv_norm_out.weight"),
-    ("out.0.bias", "conv_norm_out.bias"),
-    ("out.2.weight", "conv_out.weight"),
-    ("out.2.bias", "conv_out.bias"),
-]
-
-unet_conversion_map_resnet = [
-    # (stable-diffusion, HF Diffusers)
-    ("in_layers.0", "norm1"),
-    ("in_layers.2", "conv1"),
-    ("out_layers.0", "norm2"),
-    ("out_layers.3", "conv2"),
-    ("emb_layers.1", "time_emb_proj"),
-    ("skip_connection", "conv_shortcut"),
-]
-
-unet_conversion_map_layer = []
-# hardcoded number of downblocks and resnets/attentions...
-# would need smarter logic for other networks.
-for i in range(4):
-    # loop over downblocks/upblocks
-
-    for j in range(2):
-        # loop over resnets/attentions for downblocks
-        hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
-        sd_down_res_prefix = f"input_blocks.{3 * i + j + 1}.0."
-        unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
-
-        if i < 3:
-            # no attention layers in down_blocks.3
-            hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
-            sd_down_atn_prefix = f"input_blocks.{3 * i + j + 1}.1."
-            unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
-
-    for j in range(3):
-        # loop over resnets/attentions for upblocks
-        hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
-        sd_up_res_prefix = f"output_blocks.{3 * i + j}.0."
-        unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
-
-        if i > 0:
-            # no attention layers in up_blocks.0
-            hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
-            sd_up_atn_prefix = f"output_blocks.{3 * i + j}.1."
-            unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
-
-    if i < 3:
-        # no downsample in down_blocks.3
-        hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
-        sd_downsample_prefix = f"input_blocks.{3 * (i + 1)}.0.op."
-        unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
-
-        # no upsample in up_blocks.3
-        hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
-        sd_upsample_prefix = f"output_blocks.{3 * i + 2}.{1 if i == 0 else 2}."
-        unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
-
-hf_mid_atn_prefix = "mid_block.attentions.0."
-sd_mid_atn_prefix = "middle_block.1."
-unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
-
-for j in range(2):
-    hf_mid_res_prefix = f"mid_block.resnets.{j}."
-    sd_mid_res_prefix = f"middle_block.{2 * j}."
-    unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
-
-
-def convert_unet_state_dict(unet_state_dict):
-    # buyer beware: this is a *brittle* function,
-    # and correct output requires that all of these pieces interact in
-    # the exact order in which I have arranged them.
-    mapping = {k: k for k in unet_state_dict.keys()}
-    for sd_name, hf_name in unet_conversion_map:
-        mapping[hf_name] = sd_name
-    for k, v in mapping.items():
-        if "resnets" in k:
-            for sd_part, hf_part in unet_conversion_map_resnet:
-                v = v.replace(hf_part, sd_part)
-            mapping[k] = v
-    for k, v in mapping.items():
-        for sd_part, hf_part in unet_conversion_map_layer:
-            v = v.replace(hf_part, sd_part)
-        mapping[k] = v
-    new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()}
-    return new_state_dict
-
-
 # ================#
 # VAE Conversion #
 # ================#
@ -157,16 +58,23 @@ vae_conversion_map_attn = [
 ]


-def reshape_weight_for_sd(w):
+def reshape_weight_for_sd(w, conv3d=False):
    # convert HF linear weights to SD conv2d weights
+    if conv3d:
+        return w.reshape(*w.shape, 1, 1, 1)
+    else:
        return w.reshape(*w.shape, 1, 1)


 def convert_vae_state_dict(vae_state_dict):
    mapping = {k: k for k in vae_state_dict.keys()}
+    conv3d = False
    for k, v in mapping.items():
        for sd_part, hf_part in vae_conversion_map:
            v = v.replace(hf_part, sd_part)
+        if v.endswith(".conv.weight"):
+            if not conv3d and vae_state_dict[k].ndim == 5:
+                conv3d = True
        mapping[k] = v
    for k, v in mapping.items():
        if "attentions" in k:
@ -179,7 +87,7 @@ def convert_vae_state_dict(vae_state_dict):
        for weight_name in weights_to_convert:
            if f"mid.attn_1.{weight_name}.weight" in k:
                logging.debug(f"Reshaping {k} for SD format")
-                new_state_dict[k] = reshape_weight_for_sd(v)
+                new_state_dict[k] = reshape_weight_for_sd(v, conv3d=conv3d)
    return new_state_dict


@ -206,6 +114,7 @@ textenc_pattern = re.compile("|".join(protected.keys()))
 # Ordering is from https://github.com/pytorch/pytorch/blob/master/test/cpp/api/modules.cpp
 code2idx = {"q": 0, "k": 1, "v": 2}

+
 # This function exists because at the time of writing torch.cat can't do fp8 with cuda
 def cat_tensors(tensors):
    x = 0
@ -222,6 +131,7 @@ def cat_tensors(tensors):

    return out

+
 def convert_text_enc_state_dict_v20(text_enc_dict, prefix=""):
    new_state_dict = {}
    capture_qkv_weight = {}
@ -277,5 +187,3 @@ def convert_text_enc_state_dict_v20(text_enc_dict, prefix=""):

 def convert_text_enc_state_dict(text_enc_dict):
    return text_enc_dict
-
-
--- a/comfy/extra_samplers/uni_pc.py
+++ b/comfy/extra_samplers/uni_pc.py
@ -1,10 +1,10 @@
 #code taken from: https://github.com/wl-zhao/UniPC and modified

 import torch
-import torch.nn.functional as F
 import math
+import logging

-from tqdm.auto import trange, tqdm
+from tqdm.auto import trange


 class NoiseScheduleVP:
@ -475,7 +475,7 @@ class UniPC:
            return self.multistep_uni_pc_vary_update(x, model_prev_list, t_prev_list, t, order, **kwargs)

    def multistep_uni_pc_vary_update(self, x, model_prev_list, t_prev_list, t, order, use_corrector=True):
-        print(f'using unified predictor-corrector with order {order} (solver type: vary coeff)')
+        logging.info(f'using unified predictor-corrector with order {order} (solver type: vary coeff)')
        ns = self.noise_schedule
        assert order <= len(model_prev_list)

@ -519,7 +519,6 @@ class UniPC:
            A_p = C_inv_p

        if use_corrector:
-            print('using corrector')
            C_inv = torch.linalg.inv(C)
            A_c = C_inv

@ -662,7 +661,7 @@ class UniPC:

            if x_t is None:
                if use_predictor:
-                    pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
+                    pred_res = torch.tensordot(D1s, rhos_p, dims=([1], [0]))  # torch.einsum('k,bkchw->bchw', rhos_p, D1s)
                else:
                    pred_res = 0
                x_t = x_t_ - expand_dims(alpha_t * B_h, dims) * pred_res
@ -670,7 +669,7 @@ class UniPC:
            if use_corrector:
                model_t = self.model_fn(x_t, t)
                if D1s is not None:
-                    corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
+                    corr_res = torch.tensordot(D1s, rhos_c[:-1], dims=([1], [0]))  # torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
                else:
                    corr_res = 0
                D1_t = (model_t - model_prev_0)
@ -704,7 +703,6 @@ class UniPC:
    ):
        # t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
        # t_T = self.noise_schedule.T if t_start is None else t_start
-        device = x.device
        steps = len(timesteps) - 1
        if method == 'multistep':
            assert steps >= order
--- a/comfy/gligen.py
+++ b/comfy/gligen.py
@ -1,3 +1,4 @@
+import math
 import torch
 from torch import nn
 from .ldm.modules.attention import CrossAttention
--- a/comfy/hooks.py
+++ b/comfy/hooks.py
@ -5,6 +5,7 @@ import math
 import torch
 import numpy as np
 import itertools
+import logging

 if TYPE_CHECKING:
    from comfy.model_patcher import ModelPatcher, PatcherInjection
@ -15,91 +16,132 @@ import comfy.model_management
 import comfy.patcher_extension
 from node_helpers import conditioning_set_values

+# #######################################################################################################
+# Hooks explanation
+# -------------------
+# The purpose of hooks is to allow conds to influence sampling without the need for ComfyUI core code to
+# make explicit special cases like it does for ControlNet and GLIGEN.
+#
+# This is necessary for nodes/features that are intended for use with masked or scheduled conds, or those
+# that should run special code when a 'marked' cond is used in sampling.
+# #######################################################################################################
+
 class EnumHookMode(enum.Enum):
+    '''
+    Priority of hook memory optimization vs. speed, mostly related to WeightHooks.
+
+    MinVram: No caching will occur for any operations related to hooks.
+    MaxSpeed: Excess VRAM (and RAM, once VRAM is sufficiently depleted) will be used to cache hook weights when switching hook groups.
+    '''
    MinVram = "minvram"
    MaxSpeed = "maxspeed"

 class EnumHookType(enum.Enum):
+    '''
+    Hook types, each of which has different expected behavior.
+    '''
    Weight = "weight"
-    Patch = "patch"
    ObjectPatch = "object_patch"
-    AddModels = "add_models"
-    Callbacks = "callbacks"
-    Wrappers = "wrappers"
-    SetInjections = "add_injections"
+    AdditionalModels = "add_models"
+    TransformerOptions = "transformer_options"
+    Injections = "add_injections"

 class EnumWeightTarget(enum.Enum):
    Model = "model"
    Clip = "clip"

+class EnumHookScope(enum.Enum):
+    '''
+    Determines if hook should be limited in its influence over sampling.
+
+    AllConditioning: hook will affect all conds used in sampling.
+    HookedOnly: hook will only affect the conds it was attached to.
+    '''
+    AllConditioning = "all_conditioning"
+    HookedOnly = "hooked_only"
+
+
 class _HookRef:
    pass

-# NOTE: this is an example of how the should_register function should look
-def default_should_register(hook: 'Hook', model: 'ModelPatcher', model_options: dict, target: EnumWeightTarget, registered: list[Hook]):
+
+def default_should_register(hook: Hook, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+    '''Example for how custom_should_register function can look like.'''
    return True


+def create_target_dict(target: EnumWeightTarget=None, **kwargs) -> dict[str]:
+    '''Creates base dictionary for use with Hooks' target param.'''
+    d = {}
+    if target is not None:
+        d['target'] = target
+    d.update(kwargs)
+    return d
+
+
 class Hook:
    def __init__(self, hook_type: EnumHookType=None, hook_ref: _HookRef=None, hook_id: str=None,
-                 hook_keyframe: 'HookKeyframeGroup'=None):
+                 hook_keyframe: HookKeyframeGroup=None, hook_scope=EnumHookScope.AllConditioning):
        self.hook_type = hook_type
+        '''Enum identifying the general class of this hook.'''
        self.hook_ref = hook_ref if hook_ref else _HookRef()
+        '''Reference shared between hook clones that have the same value. Should NOT be modified.'''
        self.hook_id = hook_id
+        '''Optional string ID to identify hook; useful if need to consolidate duplicates at registration time.'''
        self.hook_keyframe = hook_keyframe if hook_keyframe else HookKeyframeGroup()
+        '''Keyframe storage that can be referenced to get strength for current sampling step.'''
+        self.hook_scope = hook_scope
+        '''Scope of where this hook should apply in terms of the conds used in sampling run.'''
        self.custom_should_register = default_should_register
-        self.auto_apply_to_nonpositive = False
+        '''Can be overriden with a compatible function to decide if this hook should be registered without the need to override .should_register'''

    @property
    def strength(self):
        return self.hook_keyframe.strength

-    def initialize_timesteps(self, model: 'BaseModel'):
+    def initialize_timesteps(self, model: BaseModel):
        self.reset()
        self.hook_keyframe.initialize_timesteps(model)

    def reset(self):
        self.hook_keyframe.reset()

-    def clone(self, subtype: Callable=None):
-        if subtype is None:
-            subtype = type(self)
-        c: Hook = subtype()
+    def clone(self):
+        c: Hook = self.__class__()
        c.hook_type = self.hook_type
        c.hook_ref = self.hook_ref
        c.hook_id = self.hook_id
        c.hook_keyframe = self.hook_keyframe
+        c.hook_scope = self.hook_scope
        c.custom_should_register = self.custom_should_register
-        # TODO: make this do something
-        c.auto_apply_to_nonpositive = self.auto_apply_to_nonpositive
        return c

-    def should_register(self, model: 'ModelPatcher', model_options: dict, target: EnumWeightTarget, registered: list[Hook]):
-        return self.custom_should_register(self, model, model_options, target, registered)
+    def should_register(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+        return self.custom_should_register(self, model, model_options, target_dict, registered)

-    def add_hook_patches(self, model: 'ModelPatcher', model_options: dict, target: EnumWeightTarget, registered: list[Hook]):
+    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
        raise NotImplementedError("add_hook_patches should be defined for Hook subclasses")

-    def on_apply(self, model: 'ModelPatcher', transformer_options: dict[str]):
-        pass
-
-    def on_unapply(self, model: 'ModelPatcher', transformer_options: dict[str]):
-        pass
-
-    def __eq__(self, other: 'Hook'):
+    def __eq__(self, other: Hook):
        return self.__class__ == other.__class__ and self.hook_ref == other.hook_ref

    def __hash__(self):
        return hash(self.hook_ref)

 class WeightHook(Hook):
+    '''
+    Hook responsible for tracking weights to be applied to some model/clip.
+
+    Note, value of hook_scope is ignored and is treated as HookedOnly.
+    '''
    def __init__(self, strength_model=1.0, strength_clip=1.0):
-        super().__init__(hook_type=EnumHookType.Weight)
+        super().__init__(hook_type=EnumHookType.Weight, hook_scope=EnumHookScope.HookedOnly)
        self.weights: dict = None
        self.weights_clip: dict = None
        self.need_weight_init = True
        self._strength_model = strength_model
        self._strength_clip = strength_clip
+        self.hook_scope = EnumHookScope.HookedOnly # this value does not matter for WeightHooks, just for docs

    @property
    def strength_model(self):
@ -109,36 +151,36 @@ class WeightHook(Hook):
    def strength_clip(self):
        return self._strength_clip * self.strength

-    def add_hook_patches(self, model: 'ModelPatcher', model_options: dict, target: EnumWeightTarget, registered: list[Hook]):
-        if not self.should_register(model, model_options, target, registered):
+    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+        if not self.should_register(model, model_options, target_dict, registered):
            return False
        weights = None
-        if target == EnumWeightTarget.Model:
-            strength = self._strength_model
-        else:
+
+        target = target_dict.get('target', None)
+        if target == EnumWeightTarget.Clip:
            strength = self._strength_clip
+        else:
+            strength = self._strength_model

        if self.need_weight_init:
            key_map = {}
-            if target == EnumWeightTarget.Model:
-                key_map = comfy.lora.model_lora_keys_unet(model.model, key_map)
-            else:
+            if target == EnumWeightTarget.Clip:
                key_map = comfy.lora.model_lora_keys_clip(model.model, key_map)
+            else:
+                key_map = comfy.lora.model_lora_keys_unet(model.model, key_map)
            weights = comfy.lora.load_lora(self.weights, key_map, log_missing=False)
        else:
-            if target == EnumWeightTarget.Model:
-                weights = self.weights
-            else:
+            if target == EnumWeightTarget.Clip:
                weights = self.weights_clip
-        k = model.add_hook_patches(hook=self, patches=weights, strength_patch=strength)
-        registered.append(self)
+            else:
+                weights = self.weights
+        model.add_hook_patches(hook=self, patches=weights, strength_patch=strength)
+        registered.add(self)
        return True
        # TODO: add logs about any keys that were not applied

-    def clone(self, subtype: Callable=None):
-        if subtype is None:
-            subtype = type(self)
-        c: WeightHook = super().clone(subtype)
+    def clone(self):
+        c: WeightHook = super().clone()
        c.weights = self.weights
        c.weights_clip = self.weights_clip
        c.need_weight_init = self.need_weight_init
@ -146,127 +188,158 @@ class WeightHook(Hook):
        c._strength_clip = self._strength_clip
        return c

-class PatchHook(Hook):
-    def __init__(self):
-        super().__init__(hook_type=EnumHookType.Patch)
-        self.patches: dict = None
-    
-    def clone(self, subtype: Callable=None):
-        if subtype is None:
-            subtype = type(self)
-        c: PatchHook = super().clone(subtype)
-        c.patches = self.patches
-        return c
-    # TODO: add functionality
-
 class ObjectPatchHook(Hook):
-    def __init__(self):
+    def __init__(self, object_patches: dict[str]=None,
+                 hook_scope=EnumHookScope.AllConditioning):
        super().__init__(hook_type=EnumHookType.ObjectPatch)
-        self.object_patches: dict = None
+        self.object_patches = object_patches
+        self.hook_scope = hook_scope

-    def clone(self, subtype: Callable=None):
-        if subtype is None:
-            subtype = type(self)
-        c: ObjectPatchHook = super().clone(subtype)
+    def clone(self):
+        c: ObjectPatchHook = super().clone()
        c.object_patches = self.object_patches
        return c
-    # TODO: add functionality

-class AddModelsHook(Hook):
-    def __init__(self, key: str=None, models: list['ModelPatcher']=None):
-        super().__init__(hook_type=EnumHookType.AddModels)
-        self.key = key
+    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+        raise NotImplementedError("ObjectPatchHook is not supported yet in ComfyUI.")
+
+class AdditionalModelsHook(Hook):
+    '''
+    Hook responsible for telling model management any additional models that should be loaded.
+
+    Note, value of hook_scope is ignored and is treated as AllConditioning.
+    '''
+    def __init__(self, models: list[ModelPatcher]=None, key: str=None):
+        super().__init__(hook_type=EnumHookType.AdditionalModels)
        self.models = models
-        self.append_when_same = True
-    
-    def clone(self, subtype: Callable=None):
-        if subtype is None:
-            subtype = type(self)
-        c: AddModelsHook = super().clone(subtype)
-        c.key = self.key
-        c.models = self.models.copy() if self.models else self.models
-        c.append_when_same = self.append_when_same
-        return c
-    # TODO: add functionality
-
-class CallbackHook(Hook):
-    def __init__(self, key: str=None, callback: Callable=None):
-        super().__init__(hook_type=EnumHookType.Callbacks)
        self.key = key
-        self.callback = callback

-    def clone(self, subtype: Callable=None):
-        if subtype is None:
-            subtype = type(self)
-        c: CallbackHook = super().clone(subtype)
+    def clone(self):
+        c: AdditionalModelsHook = super().clone()
+        c.models = self.models.copy() if self.models else self.models
        c.key = self.key
-        c.callback = self.callback
-        return c
-    # TODO: add functionality
-
-class WrapperHook(Hook):
-    def __init__(self, wrappers_dict: dict[str, dict[str, dict[str, list[Callable]]]]=None):
-        super().__init__(hook_type=EnumHookType.Wrappers)
-        self.wrappers_dict = wrappers_dict
-
-    def clone(self, subtype: Callable=None):
-        if subtype is None:
-            subtype = type(self)
-        c: WrapperHook = super().clone(subtype)
-        c.wrappers_dict = self.wrappers_dict
        return c

-    def add_hook_patches(self, model: 'ModelPatcher', model_options: dict, target: EnumWeightTarget, registered: list[Hook]):
-        if not self.should_register(model, model_options, target, registered):
+    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+        if not self.should_register(model, model_options, target_dict, registered):
            return False
-        add_model_options = {"transformer_options": self.wrappers_dict}
-        comfy.patcher_extension.merge_nested_dicts(model_options, add_model_options, copy_dict1=False)
-        registered.append(self)
+        registered.add(self)
        return True

-class SetInjectionsHook(Hook):
-    def __init__(self, key: str=None, injections: list['PatcherInjection']=None):
-        super().__init__(hook_type=EnumHookType.SetInjections)
+class TransformerOptionsHook(Hook):
+    '''
+    Hook responsible for adding wrappers, callbacks, patches, or anything else related to transformer_options.
+    '''
+    def __init__(self, transformers_dict: dict[str, dict[str, dict[str, list[Callable]]]]=None,
+                 hook_scope=EnumHookScope.AllConditioning):
+        super().__init__(hook_type=EnumHookType.TransformerOptions)
+        self.transformers_dict = transformers_dict
+        self.hook_scope = hook_scope
+        self._skip_adding = False
+        '''Internal value used to avoid double load of transformer_options when hook_scope is AllConditioning.'''
+
+    def clone(self):
+        c: TransformerOptionsHook = super().clone()
+        c.transformers_dict = self.transformers_dict
+        c._skip_adding = self._skip_adding
+        return c
+
+    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+        if not self.should_register(model, model_options, target_dict, registered):
+            return False
+        # NOTE: to_load_options will be used to manually load patches/wrappers/callbacks from hooks
+        self._skip_adding = False
+        if self.hook_scope == EnumHookScope.AllConditioning:
+            add_model_options = {"transformer_options": self.transformers_dict,
+                                 "to_load_options": self.transformers_dict}
+            # skip_adding if included in AllConditioning to avoid double loading
+            self._skip_adding = True
+        else:
+            add_model_options = {"to_load_options": self.transformers_dict}
+        registered.add(self)
+        comfy.patcher_extension.merge_nested_dicts(model_options, add_model_options, copy_dict1=False)
+        return True
+
+    def on_apply_hooks(self, model: ModelPatcher, transformer_options: dict[str]):
+        if not self._skip_adding:
+            comfy.patcher_extension.merge_nested_dicts(transformer_options, self.transformers_dict, copy_dict1=False)
+
+WrapperHook = TransformerOptionsHook
+'''Only here for backwards compatibility, WrapperHook is identical to TransformerOptionsHook.'''
+
+class InjectionsHook(Hook):
+    def __init__(self, key: str=None, injections: list[PatcherInjection]=None,
+                 hook_scope=EnumHookScope.AllConditioning):
+        super().__init__(hook_type=EnumHookType.Injections)
        self.key = key
        self.injections = injections
+        self.hook_scope = hook_scope

-    def clone(self, subtype: Callable=None):
-        if subtype is None:
-            subtype = type(self)
-        c: SetInjectionsHook = super().clone(subtype)
+    def clone(self):
+        c: InjectionsHook = super().clone()
        c.key = self.key
        c.injections = self.injections.copy() if self.injections else self.injections
        return c

-    def add_hook_injections(self, model: 'ModelPatcher'):
-        # TODO: add functionality
-        pass
+    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+        raise NotImplementedError("InjectionsHook is not supported yet in ComfyUI.")

 class HookGroup:
+    '''
+    Stores groups of hooks, and allows them to be queried by type.
+
+    To prevent breaking their functionality, never modify the underlying self.hooks or self._hook_dict vars directly;
+    always use the provided functions on HookGroup.
+    '''
    def __init__(self):
        self.hooks: list[Hook] = []
+        self._hook_dict: dict[EnumHookType, list[Hook]] = {}
+
+    def __len__(self):
+        return len(self.hooks)

    def add(self, hook: Hook):
        if hook not in self.hooks:
            self.hooks.append(hook)
+            self._hook_dict.setdefault(hook.hook_type, []).append(hook)
+
+    def remove(self, hook: Hook):
+        if hook in self.hooks:
+            self.hooks.remove(hook)
+            self._hook_dict[hook.hook_type].remove(hook)
+
+    def get_type(self, hook_type: EnumHookType):
+        return self._hook_dict.get(hook_type, [])

    def contains(self, hook: Hook):
        return hook in self.hooks

+    def is_subset_of(self, other: HookGroup):
+        self_hooks = set(self.hooks)
+        other_hooks = set(other.hooks)
+        return self_hooks.issubset(other_hooks)
+
+    def new_with_common_hooks(self, other: HookGroup):
+        c = HookGroup()
+        for hook in self.hooks:
+            if other.contains(hook):
+                c.add(hook.clone())
+        return c
+
    def clone(self):
        c = HookGroup()
        for hook in self.hooks:
            c.add(hook.clone())
        return c

-    def clone_and_combine(self, other: 'HookGroup'):
+    def clone_and_combine(self, other: HookGroup):
        c = self.clone()
        if other is not None:
            for hook in other.hooks:
                c.add(hook.clone())
        return c

-    def set_keyframes_on_hooks(self, hook_kf: 'HookKeyframeGroup'):
+    def set_keyframes_on_hooks(self, hook_kf: HookKeyframeGroup):
        if hook_kf is None:
            hook_kf = HookKeyframeGroup()
        else:
@ -274,18 +347,11 @@ class HookGroup:
        for hook in self.hooks:
            hook.hook_keyframe = hook_kf

-    def get_dict_repr(self):
-        d: dict[EnumHookType, dict[Hook, None]] = {}
-        for hook in self.hooks:
-            with_type = d.setdefault(hook.hook_type, {})
-            with_type[hook] = None
-        return d
-
    def get_hooks_for_clip_schedule(self):
        scheduled_hooks: dict[WeightHook, list[tuple[tuple[float,float], HookKeyframe]]] = {}
-        for hook in self.hooks:
        # only care about WeightHooks, for now
-            if hook.hook_type == EnumHookType.Weight:
+        for hook in self.get_type(EnumHookType.Weight):
+            hook: WeightHook
            hook_schedule = []
            # if no hook keyframes, assign default value
            if len(hook.hook_keyframe.keyframes) == 0:
@ -335,7 +401,7 @@ class HookGroup:
            hook.reset()

    @staticmethod
-    def combine_all_hooks(hooks_list: list['HookGroup'], require_count=0) -> 'HookGroup':
+    def combine_all_hooks(hooks_list: list[HookGroup], require_count=0) -> HookGroup:
        actual: list[HookGroup] = []
        for group in hooks_list:
            if group is not None:
@ -365,6 +431,12 @@ class HookKeyframe:
        self.start_t = 999999999.9
        self.guarantee_steps = guarantee_steps

+    def get_effective_guarantee_steps(self, max_sigma: torch.Tensor):
+        '''If keyframe starts before current sampling range (max_sigma), treat as 0.'''
+        if self.start_t > max_sigma:
+            return 0
+        return self.guarantee_steps
+
    def clone(self):
        c = HookKeyframe(strength=self.strength,
                         start_percent=self.start_percent, guarantee_steps=self.guarantee_steps)
@ -407,6 +479,12 @@ class HookKeyframeGroup:
        else:
            self._current_keyframe = None

+    def has_guarantee_steps(self):
+        for kf in self.keyframes:
+            if kf.guarantee_steps > 0:
+                return True
+        return False
+
    def has_index(self, index: int):
        return index >= 0 and index < len(self.keyframes)

@ -420,19 +498,20 @@ class HookKeyframeGroup:
        c._set_first_as_current()
        return c

-    def initialize_timesteps(self, model: 'BaseModel'):
+    def initialize_timesteps(self, model: BaseModel):
        for keyframe in self.keyframes:
            keyframe.start_t = model.model_sampling.percent_to_sigma(keyframe.start_percent)

-    def prepare_current_keyframe(self, curr_t: float) -> bool:
+    def prepare_current_keyframe(self, curr_t: float, transformer_options: dict[str, torch.Tensor]) -> bool:
        if self.is_empty():
            return False
        if curr_t == self._curr_t:
            return False
+        max_sigma = torch.max(transformer_options["sample_sigmas"])
        prev_index = self._current_index
        prev_strength = self._current_strength
        # if met guaranteed steps, look for next keyframe in case need to switch
-        if self._current_used_steps >= self._current_keyframe.guarantee_steps:
+        if self._current_used_steps >= self._current_keyframe.get_effective_guarantee_steps(max_sigma):
            # if has next index, loop through and see if need to switch
            if self.has_index(self._current_index+1):
                for i in range(self._current_index+1, len(self.keyframes)):
@ -445,7 +524,7 @@ class HookKeyframeGroup:
                        self._current_keyframe = eval_c
                        self._current_used_steps = 0
                        # if guarantee_steps greater than zero, stop searching for other keyframes
-                        if self._current_keyframe.guarantee_steps > 0:
+                        if self._current_keyframe.get_effective_guarantee_steps(max_sigma) > 0:
                            break
                    # if eval_c is outside the percent range, stop looking further
                    else: break
@ -508,6 +587,17 @@ def get_sorted_list_via_attr(objects: list, attr: str) -> list:
        sorted_list.extend(object_list)
    return sorted_list

+def create_transformer_options_from_hooks(model: ModelPatcher, hooks: HookGroup,  transformer_options: dict[str]=None):
+    # if no hooks or is not a ModelPatcher for sampling, return empty dict
+    if hooks is None or model.is_clip:
+        return {}
+    if transformer_options is None:
+        transformer_options = {}
+    for hook in hooks.get_type(EnumHookType.TransformerOptions):
+        hook: TransformerOptionsHook
+        hook.on_apply_hooks(model, transformer_options)
+    return transformer_options
+
 def create_hook_lora(lora: dict[str, torch.Tensor], strength_model: float, strength_clip: float):
    hook_group = HookGroup()
    hook = WeightHook(strength_model=strength_model, strength_clip=strength_clip)
@ -534,7 +624,7 @@ def create_hook_model_as_lora(weights_model, weights_clip, strength_model: float
    hook.need_weight_init = False
    return hook_group

-def get_patch_weights_from_model(model: 'ModelPatcher', discard_model_sampling=True):
+def get_patch_weights_from_model(model: ModelPatcher, discard_model_sampling=True):
    if model is None:
        return None
    patches_model: dict[str, torch.Tensor] = model.model.state_dict()
@ -546,7 +636,7 @@ def get_patch_weights_from_model(model: 'ModelPatcher', discard_model_sampling=T
    return patches_model

 # NOTE: this function shows how to register weight hooks directly on the ModelPatchers
-def load_hook_lora_for_models(model: 'ModelPatcher', clip: 'CLIP', lora: dict[str, torch.Tensor],
+def load_hook_lora_for_models(model: ModelPatcher, clip: CLIP, lora: dict[str, torch.Tensor],
                              strength_model: float, strength_clip: float):
    key_map = {}
    if model is not None:
@ -575,7 +665,7 @@ def load_hook_lora_for_models(model: 'ModelPatcher', clip: 'CLIP', lora: dict[st
    k1 = set(k1)
    for x in loaded:
        if (x not in k) and (x not in k1):
-            print(f"NOT LOADED {x}")
+            logging.warning(f"NOT LOADED {x}")
    return (new_modelpatcher, new_clip, hook_group)

 def _combine_hooks_from_values(c_dict: dict[str, HookGroup], values: dict[str, HookGroup], cache: dict[tuple[HookGroup, HookGroup], HookGroup]):
@ -598,24 +688,26 @@ def _combine_hooks_from_values(c_dict: dict[str, HookGroup], values: dict[str, H
    else:
        c_dict[hooks_key] = cache[hooks_tuple]

-def conditioning_set_values_with_hooks(conditioning, values={}, append_hooks=True):
+def conditioning_set_values_with_hooks(conditioning, values={}, append_hooks=True,
+                                       cache: dict[tuple[HookGroup, HookGroup], HookGroup]=None):
    c = []
-    hooks_combine_cache: dict[tuple[HookGroup, HookGroup], HookGroup] = {}
+    if cache is None:
+        cache = {}
    for t in conditioning:
        n = [t[0], t[1].copy()]
        for k in values:
            if append_hooks and k == 'hooks':
-                _combine_hooks_from_values(n[1], values, hooks_combine_cache)
+                _combine_hooks_from_values(n[1], values, cache)
            else:
                n[1][k] = values[k]
        c.append(n)

    return c

-def set_hooks_for_conditioning(cond, hooks: HookGroup, append_hooks=True):
+def set_hooks_for_conditioning(cond, hooks: HookGroup, append_hooks=True, cache: dict[tuple[HookGroup, HookGroup], HookGroup]=None):
    if hooks is None:
        return cond
-    return conditioning_set_values_with_hooks(cond, {'hooks': hooks}, append_hooks=append_hooks)
+    return conditioning_set_values_with_hooks(cond, {'hooks': hooks}, append_hooks=append_hooks, cache=cache)

 def set_timesteps_for_conditioning(cond, timestep_range: tuple[float,float]):
    if timestep_range is None:
@ -650,9 +742,10 @@ def combine_with_new_conds(conds: list, new_conds: list):
 def set_conds_props(conds: list, strength: float, set_cond_area: str,
                   mask: torch.Tensor=None, hooks: HookGroup=None, timesteps_range: tuple[float,float]=None, append_hooks=True):
    final_conds = []
+    cache = {}
    for c in conds:
        # first, apply lora_hook to conditioning, if provided
-        c = set_hooks_for_conditioning(c, hooks, append_hooks=append_hooks)
+        c = set_hooks_for_conditioning(c, hooks, append_hooks=append_hooks, cache=cache)
        # next, apply mask to conditioning
        c = set_mask_for_conditioning(cond=c, mask=mask, strength=strength, set_cond_area=set_cond_area)
        # apply timesteps, if present
@ -664,9 +757,10 @@ def set_conds_props(conds: list, strength: float, set_cond_area: str,
 def set_conds_props_and_combine(conds: list, new_conds: list, strength: float=1.0, set_cond_area: str="default",
                               mask: torch.Tensor=None, hooks: HookGroup=None, timesteps_range: tuple[float,float]=None, append_hooks=True):
    combined_conds = []
+    cache = {}
    for c, masked_c in zip(conds, new_conds):
        # first, apply lora_hook to new conditioning, if provided
-        masked_c = set_hooks_for_conditioning(masked_c, hooks, append_hooks=append_hooks)
+        masked_c = set_hooks_for_conditioning(masked_c, hooks, append_hooks=append_hooks, cache=cache)
        # next, apply mask to new conditioning, if provided
        masked_c = set_mask_for_conditioning(cond=masked_c, mask=mask, set_cond_area=set_cond_area, strength=strength)
        # apply timesteps, if present
@ -678,9 +772,10 @@ def set_conds_props_and_combine(conds: list, new_conds: list, strength: float=1.
 def set_default_conds_and_combine(conds: list, new_conds: list,
                                   hooks: HookGroup=None, timesteps_range: tuple[float,float]=None, append_hooks=True):
    combined_conds = []
+    cache = {}
    for c, new_c in zip(conds, new_conds):
        # first, apply lora_hook to new conditioning, if provided
-        new_c = set_hooks_for_conditioning(new_c, hooks, append_hooks=append_hooks)
+        new_c = set_hooks_for_conditioning(new_c, hooks, append_hooks=append_hooks, cache=cache)
        # next, add default_cond key to cond so that during sampling, it can be identified
        new_c = conditioning_set_values(new_c, {'default': True})
        # apply timesteps, if present
--- a/comfy/image_encoders/dino2.py
+++ b/comfy/image_encoders/dino2.py
@ -0,0 +1,141 @@
+import torch
+from comfy.text_encoders.bert import BertAttention
+import comfy.model_management
+from comfy.ldm.modules.attention import optimized_attention_for_device
+
+
+class Dino2AttentionOutput(torch.nn.Module):
+    def __init__(self, input_dim, output_dim, layer_norm_eps, dtype, device, operations):
+        super().__init__()
+        self.dense = operations.Linear(input_dim, output_dim, dtype=dtype, device=device)
+
+    def forward(self, x):
+        return self.dense(x)
+
+
+class Dino2AttentionBlock(torch.nn.Module):
+    def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations):
+        super().__init__()
+        self.attention = BertAttention(embed_dim, heads, dtype, device, operations)
+        self.output = Dino2AttentionOutput(embed_dim, embed_dim, layer_norm_eps, dtype, device, operations)
+
+    def forward(self, x, mask, optimized_attention):
+        return self.output(self.attention(x, mask, optimized_attention))
+
+
+class LayerScale(torch.nn.Module):
+    def __init__(self, dim, dtype, device, operations):
+        super().__init__()
+        self.lambda1 = torch.nn.Parameter(torch.empty(dim, device=device, dtype=dtype))
+
+    def forward(self, x):
+        return x * comfy.model_management.cast_to_device(self.lambda1, x.device, x.dtype)
+
+
+class SwiGLUFFN(torch.nn.Module):
+    def __init__(self, dim, dtype, device, operations):
+        super().__init__()
+        in_features = out_features = dim
+        hidden_features = int(dim * 4)
+        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+
+        self.weights_in = operations.Linear(in_features, 2 * hidden_features, bias=True, device=device, dtype=dtype)
+        self.weights_out = operations.Linear(hidden_features, out_features, bias=True, device=device, dtype=dtype)
+
+    def forward(self, x):
+        x = self.weights_in(x)
+        x1, x2 = x.chunk(2, dim=-1)
+        x = torch.nn.functional.silu(x1) * x2
+        return self.weights_out(x)
+
+
+class Dino2Block(torch.nn.Module):
+    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations):
+        super().__init__()
+        self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations)
+        self.layer_scale1 = LayerScale(dim, dtype, device, operations)
+        self.layer_scale2 = LayerScale(dim, dtype, device, operations)
+        self.mlp = SwiGLUFFN(dim, dtype, device, operations)
+        self.norm1 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
+        self.norm2 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
+
+    def forward(self, x, optimized_attention):
+        x = x + self.layer_scale1(self.attention(self.norm1(x), None, optimized_attention))
+        x = x + self.layer_scale2(self.mlp(self.norm2(x)))
+        return x
+
+
+class Dino2Encoder(torch.nn.Module):
+    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations):
+        super().__init__()
+        self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations) for _ in range(num_layers)])
+
+    def forward(self, x, intermediate_output=None):
+        optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
+
+        if intermediate_output is not None:
+            if intermediate_output < 0:
+                intermediate_output = len(self.layer) + intermediate_output
+
+        intermediate = None
+        for i, l in enumerate(self.layer):
+            x = l(x, optimized_attention)
+            if i == intermediate_output:
+                intermediate = x.clone()
+        return x, intermediate
+
+
+class Dino2PatchEmbeddings(torch.nn.Module):
+    def __init__(self, dim, num_channels=3, patch_size=14, image_size=518, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.projection = operations.Conv2d(
+            in_channels=num_channels,
+            out_channels=dim,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=True,
+            dtype=dtype,
+            device=device
+        )
+
+    def forward(self, pixel_values):
+        return self.projection(pixel_values).flatten(2).transpose(1, 2)
+
+
+class Dino2Embeddings(torch.nn.Module):
+    def __init__(self, dim, dtype, device, operations):
+        super().__init__()
+        patch_size = 14
+        image_size = 518
+
+        self.patch_embeddings = Dino2PatchEmbeddings(dim, patch_size=patch_size, image_size=image_size, dtype=dtype, device=device, operations=operations)
+        self.position_embeddings = torch.nn.Parameter(torch.empty(1, (image_size // patch_size) ** 2 + 1, dim, dtype=dtype, device=device))
+        self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device))
+        self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))
+
+    def forward(self, pixel_values):
+        x = self.patch_embeddings(pixel_values)
+        # TODO: mask_token?
+        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+        x = x + comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype)
+        return x
+
+
+class Dinov2Model(torch.nn.Module):
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        num_layers = config_dict["num_hidden_layers"]
+        dim = config_dict["hidden_size"]
+        heads = config_dict["num_attention_heads"]
+        layer_norm_eps = config_dict["layer_norm_eps"]
+
+        self.embeddings = Dino2Embeddings(dim, dtype, device, operations)
+        self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations)
+        self.layernorm = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
+
+    def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
+        x = self.embeddings(pixel_values)
+        x, i = self.encoder(x, intermediate_output=intermediate_output)
+        x = self.layernorm(x)
+        pooled_output = x[:, 0, :]
+        return x, i, pooled_output, None
--- a/comfy/image_encoders/dino2_giant.json
+++ b/comfy/image_encoders/dino2_giant.json
@ -0,0 +1,21 @@
+{
+  "attention_probs_dropout_prob": 0.0,
+  "drop_path_rate": 0.0,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 1536,
+  "image_size": 518,
+  "initializer_range": 0.02,
+  "layer_norm_eps": 1e-06,
+  "layerscale_value": 1.0,
+  "mlp_ratio": 4,
+  "model_type": "dinov2",
+  "num_attention_heads": 24,
+  "num_channels": 3,
+  "num_hidden_layers": 40,
+  "patch_size": 14,
+  "qkv_bias": true,
+  "use_swiglu_ffn": true,
+  "image_mean": [0.485, 0.456, 0.406],
+  "image_std": [0.229, 0.224, 0.225]
+}
--- a/comfy/k_diffusion/deis.py
+++ b/comfy/k_diffusion/deis.py
@ -11,7 +11,6 @@ import numpy as np
 # Transfer from the input time (sigma) used in EDM to that (t) used in DEIS.

 def edm2t(edm_steps, epsilon_s=1e-3, sigma_min=0.002, sigma_max=80):
-    vp_sigma = lambda beta_d, beta_min: lambda t: (np.e ** (0.5 * beta_d * (t ** 2) + beta_min * t) - 1) ** 0.5
    vp_sigma_inv = lambda beta_d, beta_min: lambda sigma: ((beta_min ** 2 + 2 * beta_d * (sigma ** 2 + 1).log()).sqrt() - beta_min) / beta_d
    vp_beta_d = 2 * (np.log(torch.tensor(sigma_min).cpu() ** 2 + 1) / epsilon_s - np.log(torch.tensor(sigma_max).cpu() ** 2 + 1)) / (epsilon_s - 1)
    vp_beta_min = np.log(torch.tensor(sigma_max).cpu() ** 2 + 1) - 0.5 * vp_beta_d
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@ -40,7 +40,7 @@ def get_sigmas_polyexponential(n, sigma_min, sigma_max, rho=1., device='cpu'):
 def get_sigmas_vp(n, beta_d=19.9, beta_min=0.1, eps_s=1e-3, device='cpu'):
    """Constructs a continuous VP noise schedule."""
    t = torch.linspace(1, eps_s, n, device=device)
-    sigmas = torch.sqrt(torch.exp(beta_d * t ** 2 / 2 + beta_min * t) - 1)
+    sigmas = torch.sqrt(torch.special.expm1(beta_d * t ** 2 / 2 + beta_min * t))
    return append_zero(sigmas)


@ -70,8 +70,14 @@ def get_ancestral_step(sigma_from, sigma_to, eta=1.):
    return sigma_down, sigma_up


-def default_noise_sampler(x):
-    return lambda sigma, sigma_next: torch.randn_like(x)
+def default_noise_sampler(x, seed=None):
+    if seed is not None:
+        generator = torch.Generator(device=x.device)
+        generator.manual_seed(seed)
+    else:
+        generator = None
+
+    return lambda sigma, sigma_next: torch.randn(x.size(), dtype=x.dtype, layout=x.layout, device=x.device, generator=generator)


 class BatchedBrownianTree:
@ -168,7 +174,8 @@ def sample_euler_ancestral(model, x, sigmas, extra_args=None, callback=None, dis
        return sample_euler_ancestral_RF(model, x, sigmas, extra_args, callback, disable, eta, s_noise, noise_sampler)
    """Ancestral sampling with Euler method steps."""
    extra_args = {} if extra_args is None else extra_args
-    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -189,7 +196,8 @@ def sample_euler_ancestral(model, x, sigmas, extra_args=None, callback=None, dis
 def sample_euler_ancestral_RF(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1.0, s_noise=1., noise_sampler=None):
    """Ancestral sampling with Euler method steps."""
    extra_args = {} if extra_args is None else extra_args
-    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -290,7 +298,8 @@ def sample_dpm_2_ancestral(model, x, sigmas, extra_args=None, callback=None, dis

    """Ancestral sampling with DPM-Solver second-order steps."""
    extra_args = {} if extra_args is None else extra_args
-    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -318,7 +327,8 @@ def sample_dpm_2_ancestral(model, x, sigmas, extra_args=None, callback=None, dis
 def sample_dpm_2_ancestral_RF(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """Ancestral sampling with DPM-Solver second-order steps."""
    extra_args = {} if extra_args is None else extra_args
-    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -465,7 +475,7 @@ class DPMSolver(nn.Module):
        return x_3, eps_cache

    def dpm_solver_fast(self, x, t_start, t_end, nfe, eta=0., s_noise=1., noise_sampler=None):
-        noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+        noise_sampler = default_noise_sampler(x, seed=self.extra_args.get("seed", None)) if noise_sampler is None else noise_sampler
        if not t_end > t_start and eta:
            raise ValueError('eta must be 0 for reverse sampling')

@ -504,7 +514,7 @@ class DPMSolver(nn.Module):
        return x

    def dpm_solver_adaptive(self, x, t_start, t_end, order=3, rtol=0.05, atol=0.0078, h_init=0.05, pcoeff=0., icoeff=1., dcoeff=0., accept_safety=0.81, eta=0., s_noise=1., noise_sampler=None):
-        noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+        noise_sampler = default_noise_sampler(x, seed=self.extra_args.get("seed", None)) if noise_sampler is None else noise_sampler
        if order not in {2, 3}:
            raise ValueError('order should be 2 or 3')
        forward = t_end > t_start
@ -591,7 +601,8 @@ def sample_dpmpp_2s_ancestral(model, x, sigmas, extra_args=None, callback=None,

    """Ancestral sampling with DPM-Solver++(2S) second-order steps."""
    extra_args = {} if extra_args is None else extra_args
-    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    sigma_fn = lambda t: t.neg().exp()
    t_fn = lambda sigma: sigma.log().neg()
@ -625,7 +636,8 @@ def sample_dpmpp_2s_ancestral(model, x, sigmas, extra_args=None, callback=None,
 def sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """Ancestral sampling with DPM-Solver++(2S) second-order steps."""
    extra_args = {} if extra_args is None else extra_args
-    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    sigma_fn = lambda lbda: (lbda.exp() + 1) ** -1
    lambda_fn = lambda sigma: ((1-sigma)/sigma).log()
@ -676,10 +688,10 @@ def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=N
    if len(sigmas) <= 1:
        return x

+    extra_args = {} if extra_args is None else extra_args
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    seed = extra_args.get("seed", None)
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
-    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])
    sigma_fn = lambda t: t.neg().exp()
    t_fn = lambda sigma: sigma.log().neg()
@ -750,10 +762,10 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    if solver_type not in {'heun', 'midpoint'}:
        raise ValueError('solver_type must be \'heun\' or \'midpoint\'')

+    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
-    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])

    old_denoised = None
@ -796,10 +808,10 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    if len(sigmas) <= 1:
        return x

+    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
-    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])

    denoised_1, denoised_2 = None, None
@ -846,7 +858,7 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
 def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    if len(sigmas) <= 1:
        return x
-
+    extra_args = {} if extra_args is None else extra_args
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_3m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler)
@ -855,7 +867,7 @@ def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di
 def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'):
    if len(sigmas) <= 1:
        return x
-
+    extra_args = {} if extra_args is None else extra_args
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_2m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type)
@ -864,7 +876,7 @@ def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di
 def sample_dpmpp_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
    if len(sigmas) <= 1:
        return x
-
+    extra_args = {} if extra_args is None else extra_args
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, r=r)
@ -882,7 +894,8 @@ def DDPMSampler_step(x, sigma, sigma_prev, noise, noise_sampler):

 def generic_step_sampler(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None, step_function=None):
    extra_args = {} if extra_args is None else extra_args
-    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])

    for i in trange(len(sigmas) - 1, disable=disable):
@ -902,7 +915,8 @@ def sample_ddpm(model, x, sigmas, extra_args=None, callback=None, disable=None,
@torch.no_grad()
 def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None):
    extra_args = {} if extra_args is None else extra_args
-    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -1153,7 +1167,8 @@ def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disabl
 def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """Ancestral sampling with Euler method steps."""
    extra_args = {} if extra_args is None else extra_args
-    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler

    temp = [0]
    def post_cfg_function(args):
@ -1179,7 +1194,8 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No
 def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """Ancestral sampling with DPM-Solver++(2S) second-order steps."""
    extra_args = {} if extra_args is None else extra_args
-    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler

    temp = [0]
    def post_cfg_function(args):
@ -1249,3 +1265,258 @@ def sample_dpmpp_2m_cfg_pp(model, x, sigmas, extra_args=None, callback=None, dis
        x = denoised + denoised_mix + torch.exp(-h) * x
        old_uncond_denoised = uncond_denoised
    return x
+
+@torch.no_grad()
+def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None, eta=1., cfg_pp=False):
+    extra_args = {} if extra_args is None else extra_args
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+    s_in = x.new_ones([x.shape[0]])
+    sigma_fn = lambda t: t.neg().exp()
+    t_fn = lambda sigma: sigma.log().neg()
+    phi1_fn = lambda t: torch.expm1(t) / t
+    phi2_fn = lambda t: (phi1_fn(t) - 1.0) / t
+
+    old_denoised = None
+    uncond_denoised = None
+    def post_cfg_function(args):
+        nonlocal uncond_denoised
+        uncond_denoised = args["uncond_denoised"]
+        return args["denoised"]
+
+    if cfg_pp:
+        model_options = extra_args.get("model_options", {}).copy()
+        extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
+
+    for i in trange(len(sigmas) - 1, disable=disable):
+        denoised = model(x, sigmas[i] * s_in, **extra_args)
+        sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
+        if callback is not None:
+            callback({"x": x, "i": i, "sigma": sigmas[i], "sigma_hat": sigmas[i], "denoised": denoised})
+        if sigma_down == 0 or old_denoised is None:
+            # Euler method
+            if cfg_pp:
+                d = to_d(x, sigmas[i], uncond_denoised)
+                x = denoised + d * sigma_down
+            else:
+                d = to_d(x, sigmas[i], denoised)
+                dt = sigma_down - sigmas[i]
+                x = x + d * dt
+        else:
+            # Second order multistep method in https://arxiv.org/pdf/2308.02157
+            t, t_next, t_prev = t_fn(sigmas[i]), t_fn(sigma_down), t_fn(sigmas[i - 1])
+            h = t_next - t
+            c2 = (t_prev - t) / h
+
+            phi1_val, phi2_val = phi1_fn(-h), phi2_fn(-h)
+            b1 = torch.nan_to_num(phi1_val - phi2_val / c2, nan=0.0)
+            b2 = torch.nan_to_num(phi2_val / c2, nan=0.0)
+
+            if cfg_pp:
+                x = x + (denoised - uncond_denoised)
+                x = sigma_fn(h) * x + h * (b1 * uncond_denoised + b2 * old_denoised)
+            else:
+                x = sigma_fn(h) * x + h * (b1 * denoised + b2 * old_denoised)
+
+        # Noise addition
+        if sigmas[i + 1] > 0:
+            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
+
+        if cfg_pp:
+            old_denoised = uncond_denoised
+        else:
+            old_denoised = denoised
+    return x
+
+@torch.no_grad()
+def sample_res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None):
+    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=0., cfg_pp=False)
+
+@torch.no_grad()
+def sample_res_multistep_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None):
+    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=0., cfg_pp=True)
+
+@torch.no_grad()
+def sample_res_multistep_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
+    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=eta, cfg_pp=False)
+
+@torch.no_grad()
+def sample_res_multistep_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
+    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=eta, cfg_pp=True)
+
+@torch.no_grad()
+def sample_gradient_estimation(model, x, sigmas, extra_args=None, callback=None, disable=None, ge_gamma=2.):
+    """Gradient-estimation sampler. Paper: https://openreview.net/pdf?id=o2ND9v0CeK"""
+    extra_args = {} if extra_args is None else extra_args
+    s_in = x.new_ones([x.shape[0]])
+    old_d = None
+
+    for i in trange(len(sigmas) - 1, disable=disable):
+        denoised = model(x, sigmas[i] * s_in, **extra_args)
+        d = to_d(x, sigmas[i], denoised)
+        if callback is not None:
+            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
+        dt = sigmas[i + 1] - sigmas[i]
+        if i == 0:
+            # Euler method
+            x = x + d * dt
+        else:
+            # Gradient estimation
+            d_bar = ge_gamma * d + (1 - ge_gamma) * old_d
+            x = x + d_bar * dt
+        old_d = d
+    return x
+
+@torch.no_grad()
+def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None, noise_scaler=None, max_stage=3):
+    """
+    Extended Reverse-Time SDE solver (VE ER-SDE-Solver-3). Arxiv: https://arxiv.org/abs/2309.06169.
+    Code reference: https://github.com/QinpengCui/ER-SDE-Solver/blob/main/er_sde_solver.py.
+    """
+    extra_args = {} if extra_args is None else extra_args
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+    s_in = x.new_ones([x.shape[0]])
+
+    def default_noise_scaler(sigma):
+        return sigma * ((sigma ** 0.3).exp() + 10.0)
+    noise_scaler = default_noise_scaler if noise_scaler is None else noise_scaler
+    num_integration_points = 200.0
+    point_indice = torch.arange(0, num_integration_points, dtype=torch.float32, device=x.device)
+
+    old_denoised = None
+    old_denoised_d = None
+
+    for i in trange(len(sigmas) - 1, disable=disable):
+        denoised = model(x, sigmas[i] * s_in, **extra_args)
+        if callback is not None:
+            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
+        stage_used = min(max_stage, i + 1)
+        if sigmas[i + 1] == 0:
+            x = denoised
+        elif stage_used == 1:
+            r = noise_scaler(sigmas[i + 1]) / noise_scaler(sigmas[i])
+            x = r * x + (1 - r) * denoised
+        else:
+            r = noise_scaler(sigmas[i + 1]) / noise_scaler(sigmas[i])
+            x = r * x + (1 - r) * denoised
+
+            dt = sigmas[i + 1] - sigmas[i]
+            sigma_step_size = -dt / num_integration_points
+            sigma_pos = sigmas[i + 1] + point_indice * sigma_step_size
+            scaled_pos = noise_scaler(sigma_pos)
+
+            # Stage 2
+            s = torch.sum(1 / scaled_pos) * sigma_step_size
+            denoised_d = (denoised - old_denoised) / (sigmas[i] - sigmas[i - 1])
+            x = x + (dt + s * noise_scaler(sigmas[i + 1])) * denoised_d
+
+            if stage_used >= 3:
+                # Stage 3
+                s_u = torch.sum((sigma_pos - sigmas[i]) / scaled_pos) * sigma_step_size
+                denoised_u = (denoised_d - old_denoised_d) / ((sigmas[i] - sigmas[i - 2]) / 2)
+                x = x + ((dt ** 2) / 2 + s_u * noise_scaler(sigmas[i + 1])) * denoised_u
+            old_denoised_d = denoised_d
+
+        if s_noise != 0 and sigmas[i + 1] > 0:
+            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * (sigmas[i + 1] ** 2 - sigmas[i] ** 2 * r ** 2).sqrt().nan_to_num(nan=0.0)
+        old_denoised = denoised
+    return x
+
+@torch.no_grad()
+def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5):
+    '''
+    SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VE Data Prediction) stage 2
+    Arxiv: https://arxiv.org/abs/2305.14267
+    '''
+    extra_args = {} if extra_args is None else extra_args
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+    s_in = x.new_ones([x.shape[0]])
+
+    inject_noise = eta > 0 and s_noise > 0
+
+    for i in trange(len(sigmas) - 1, disable=disable):
+        denoised = model(x, sigmas[i] * s_in, **extra_args)
+        if callback is not None:
+            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
+        if sigmas[i + 1] == 0:
+            x = denoised
+        else:
+            t, t_next = -sigmas[i].log(), -sigmas[i + 1].log()
+            h = t_next - t
+            h_eta = h * (eta + 1)
+            s = t + r * h
+            fac = 1 / (2 * r)
+            sigma_s = s.neg().exp()
+
+            coeff_1, coeff_2 = (-r * h_eta).expm1(), (-h_eta).expm1()
+            if inject_noise:
+                noise_coeff_1 = (-2 * r * h * eta).expm1().neg().sqrt()
+                noise_coeff_2 = ((-2 * r * h * eta).expm1() - (-2 * h * eta).expm1()).sqrt()
+                noise_1, noise_2 = noise_sampler(sigmas[i], sigma_s), noise_sampler(sigma_s, sigmas[i + 1])
+
+            # Step 1
+            x_2 = (coeff_1 + 1) * x - coeff_1 * denoised
+            if inject_noise:
+                x_2 = x_2 + sigma_s * (noise_coeff_1 * noise_1) * s_noise
+            denoised_2 = model(x_2, sigma_s * s_in, **extra_args)
+
+            # Step 2
+            denoised_d = (1 - fac) * denoised + fac * denoised_2
+            x = (coeff_2 + 1) * x - coeff_2 * denoised_d
+            if inject_noise:
+                x = x + sigmas[i + 1] * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise
+    return x
+
+@torch.no_grad()
+def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r_1=1./3, r_2=2./3):
+    '''
+    SEEDS-3 - Stochastic Explicit Exponential Derivative-free Solvers (VE Data Prediction) stage 3
+    Arxiv: https://arxiv.org/abs/2305.14267
+    '''
+    extra_args = {} if extra_args is None else extra_args
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+    s_in = x.new_ones([x.shape[0]])
+
+    inject_noise = eta > 0 and s_noise > 0
+
+    for i in trange(len(sigmas) - 1, disable=disable):
+        denoised = model(x, sigmas[i] * s_in, **extra_args)
+        if callback is not None:
+            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
+        if sigmas[i + 1] == 0:
+            x = denoised
+        else:
+            t, t_next = -sigmas[i].log(), -sigmas[i + 1].log()
+            h = t_next - t
+            h_eta = h * (eta + 1)
+            s_1 = t + r_1 * h
+            s_2 = t + r_2 * h
+            sigma_s_1, sigma_s_2 = s_1.neg().exp(), s_2.neg().exp()
+
+            coeff_1, coeff_2, coeff_3 = (-r_1 * h_eta).expm1(), (-r_2 * h_eta).expm1(), (-h_eta).expm1()
+            if inject_noise:
+                noise_coeff_1 = (-2 * r_1 * h * eta).expm1().neg().sqrt()
+                noise_coeff_2 = ((-2 * r_1 * h * eta).expm1() - (-2 * r_2 * h * eta).expm1()).sqrt()
+                noise_coeff_3 = ((-2 * r_2 * h * eta).expm1() - (-2 * h * eta).expm1()).sqrt()
+                noise_1, noise_2, noise_3 = noise_sampler(sigmas[i], sigma_s_1), noise_sampler(sigma_s_1, sigma_s_2), noise_sampler(sigma_s_2, sigmas[i + 1])
+
+            # Step 1
+            x_2 = (coeff_1 + 1) * x - coeff_1 * denoised
+            if inject_noise:
+                x_2 = x_2 + sigma_s_1 * (noise_coeff_1 * noise_1) * s_noise
+            denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args)
+
+            # Step 2
+            x_3 = (coeff_2 + 1) * x - coeff_2 * denoised + (r_2 / r_1) * (coeff_2 / (r_2 * h_eta) + 1) * (denoised_2 - denoised)
+            if inject_noise:
+                x_3 = x_3 + sigma_s_2 * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise
+            denoised_3 = model(x_3, sigma_s_2 * s_in, **extra_args)
+
+            # Step 3
+            x = (coeff_3 + 1) * x - coeff_3 * denoised + (1. / r_2) * (coeff_3 / h_eta + 1) * (denoised_3 - denoised)
+            if inject_noise:
+                x = x + sigmas[i + 1] * (noise_coeff_3 * noise_1 + noise_coeff_2 * noise_2 + noise_coeff_1 * noise_3) * s_noise
+    return x
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@ -3,6 +3,7 @@ import torch
 class LatentFormat:
    scale_factor = 1.0
    latent_channels = 4
+    latent_dimensions = 2
    latent_rgb_factors = None
    latent_rgb_factors_bias = None
    taesd_decoder_name = None
@ -143,6 +144,7 @@ class SD3(LatentFormat):

 class StableAudio1(LatentFormat):
    latent_channels = 64
+    latent_dimensions = 1

 class Flux(SD3):
    latent_channels = 16
@ -178,6 +180,7 @@ class Flux(SD3):

 class Mochi(LatentFormat):
    latent_channels = 12
+    latent_dimensions = 3

    def __init__(self):
        self.scale_factor = 1.0
@ -219,6 +222,8 @@ class Mochi(LatentFormat):

 class LTXV(LatentFormat):
    latent_channels = 128
+    latent_dimensions = 3
+
    def __init__(self):
        self.latent_rgb_factors = [
            [ 1.1202e-02, -6.3815e-04, -1.0021e-02],
@ -352,3 +357,112 @@ class LTXV(LatentFormat):
        ]

        self.latent_rgb_factors_bias = [-0.0571, -0.1657, -0.2512]
+
+class HunyuanVideo(LatentFormat):
+    latent_channels = 16
+    latent_dimensions = 3
+    scale_factor = 0.476986
+    latent_rgb_factors = [
+        [-0.0395, -0.0331,  0.0445],
+        [ 0.0696,  0.0795,  0.0518],
+        [ 0.0135, -0.0945, -0.0282],
+        [ 0.0108, -0.0250, -0.0765],
+        [-0.0209,  0.0032,  0.0224],
+        [-0.0804, -0.0254, -0.0639],
+        [-0.0991,  0.0271, -0.0669],
+        [-0.0646, -0.0422, -0.0400],
+        [-0.0696, -0.0595, -0.0894],
+        [-0.0799, -0.0208, -0.0375],
+        [ 0.1166,  0.1627,  0.0962],
+        [ 0.1165,  0.0432,  0.0407],
+        [-0.2315, -0.1920, -0.1355],
+        [-0.0270,  0.0401, -0.0821],
+        [-0.0616, -0.0997, -0.0727],
+        [ 0.0249, -0.0469, -0.1703]
+    ]
+
+    latent_rgb_factors_bias = [ 0.0259, -0.0192, -0.0761]
+
+class Cosmos1CV8x8x8(LatentFormat):
+    latent_channels = 16
+    latent_dimensions = 3
+
+    latent_rgb_factors = [
+        [ 0.1817,  0.2284,  0.2423],
+        [-0.0586, -0.0862, -0.3108],
+        [-0.4703, -0.4255, -0.3995],
+        [ 0.0803,  0.1963,  0.1001],
+        [-0.0820, -0.1050,  0.0400],
+        [ 0.2511,  0.3098,  0.2787],
+        [-0.1830, -0.2117, -0.0040],
+        [-0.0621, -0.2187, -0.0939],
+        [ 0.3619,  0.1082,  0.1455],
+        [ 0.3164,  0.3922,  0.2575],
+        [ 0.1152,  0.0231, -0.0462],
+        [-0.1434, -0.3609, -0.3665],
+        [ 0.0635,  0.1471,  0.1680],
+        [-0.3635, -0.1963, -0.3248],
+        [-0.1865,  0.0365,  0.2346],
+        [ 0.0447,  0.0994,  0.0881]
+    ]
+
+    latent_rgb_factors_bias = [-0.1223, -0.1889, -0.1976]
+
+class Wan21(LatentFormat):
+    latent_channels = 16
+    latent_dimensions = 3
+
+    latent_rgb_factors = [
+            [-0.1299, -0.1692,  0.2932],
+            [ 0.0671,  0.0406,  0.0442],
+            [ 0.3568,  0.2548,  0.1747],
+            [ 0.0372,  0.2344,  0.1420],
+            [ 0.0313,  0.0189, -0.0328],
+            [ 0.0296, -0.0956, -0.0665],
+            [-0.3477, -0.4059, -0.2925],
+            [ 0.0166,  0.1902,  0.1975],
+            [-0.0412,  0.0267, -0.1364],
+            [-0.1293,  0.0740,  0.1636],
+            [ 0.0680,  0.3019,  0.1128],
+            [ 0.0032,  0.0581,  0.0639],
+            [-0.1251,  0.0927,  0.1699],
+            [ 0.0060, -0.0633,  0.0005],
+            [ 0.3477,  0.2275,  0.2950],
+            [ 0.1984,  0.0913,  0.1861]
+        ]
+
+    latent_rgb_factors_bias = [-0.1835, -0.0868, -0.3360]
+
+    def __init__(self):
+        self.scale_factor = 1.0
+        self.latents_mean = torch.tensor([
+            -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
+            0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
+        ]).view(1, self.latent_channels, 1, 1, 1)
+        self.latents_std = torch.tensor([
+            2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
+            3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
+        ]).view(1, self.latent_channels, 1, 1, 1)
+
+
+        self.taesd_decoder_name = None #TODO
+
+    def process_in(self, latent):
+        latents_mean = self.latents_mean.to(latent.device, latent.dtype)
+        latents_std = self.latents_std.to(latent.device, latent.dtype)
+        return (latent - latents_mean) * self.scale_factor / latents_std
+
+    def process_out(self, latent):
+        latents_mean = self.latents_mean.to(latent.device, latent.dtype)
+        latents_std = self.latents_std.to(latent.device, latent.dtype)
+        return latent * latents_std / self.scale_factor + latents_mean
+
+class Hunyuan3Dv2(LatentFormat):
+    latent_channels = 64
+    latent_dimensions = 1
+    scale_factor = 0.9990943042622529
+
+class Hunyuan3Dv2mini(LatentFormat):
+    latent_channels = 64
+    latent_dimensions = 1
+    scale_factor = 1.0188137142395404
--- a/comfy/ldm/audio/autoencoder.py
+++ b/comfy/ldm/audio/autoencoder.py
@ -2,7 +2,7 @@

 import torch
 from torch import nn
-from typing import Literal, Dict, Any
+from typing import Literal
 import math
 import comfy.ops
 ops = comfy.ops.disable_weight_init
@ -97,7 +97,7 @@ def get_activation(activation: Literal["elu", "snake", "none"], antialias=False,
        raise ValueError(f"Unknown activation {activation}")

    if antialias:
-        act = Activation1d(act)
+        act = Activation1d(act)  # noqa: F821 Activation1d is not defined

    return act

--- a/comfy/ldm/audio/dit.py
+++ b/comfy/ldm/audio/dit.py
@ -158,7 +158,6 @@ class RotaryEmbedding(nn.Module):
    def forward(self, t):
        # device = self.inv_freq.device
        device = t.device
-        dtype = t.dtype

        # t = t.to(torch.float32)

@ -170,7 +169,7 @@ class RotaryEmbedding(nn.Module):
        if self.scale is None:
            return freqs, 1.

-        power = (torch.arange(seq_len, device = device) - (seq_len // 2)) / self.scale_base
+        power = (torch.arange(seq_len, device = device) - (seq_len // 2)) / self.scale_base  # noqa: F821 seq_len is not defined
        scale = comfy.ops.cast_to_input(self.scale, t) ** rearrange(power, 'n -> n 1')
        scale = torch.cat((scale, scale), dim = -1)

@ -229,9 +228,9 @@ class FeedForward(nn.Module):
            linear_in = GLU(dim, inner_dim, activation, dtype=dtype, device=device, operations=operations)
        else:
            linear_in = nn.Sequential(
-                Rearrange('b n d -> b d n') if use_conv else nn.Identity(),
+                rearrange('b n d -> b d n') if use_conv else nn.Identity(),
                operations.Linear(dim, inner_dim, bias = not no_bias, dtype=dtype, device=device) if not use_conv else operations.Conv1d(dim, inner_dim, conv_kernel_size, padding = (conv_kernel_size // 2), bias = not no_bias, dtype=dtype, device=device),
-                Rearrange('b n d -> b d n') if use_conv else nn.Identity(),
+                rearrange('b n d -> b d n') if use_conv else nn.Identity(),
                activation
            )

@ -246,9 +245,9 @@ class FeedForward(nn.Module):

        self.ff = nn.Sequential(
            linear_in,
-            Rearrange('b d n -> b n d') if use_conv else nn.Identity(),
+            rearrange('b d n -> b n d') if use_conv else nn.Identity(),
            linear_out,
-            Rearrange('b n d -> b d n') if use_conv else nn.Identity(),
+            rearrange('b n d -> b d n') if use_conv else nn.Identity(),
        )

    def forward(self, x):
@ -346,18 +345,13 @@ class Attention(nn.Module):

        # determine masking
        masks = []
-        final_attn_mask = None # The mask that will be applied to the attention matrix, taking all masks into account

        if input_mask is not None:
            input_mask = rearrange(input_mask, 'b j -> b 1 1 j')
            masks.append(~input_mask)

        # Other masks will be added here later
-
-        if len(masks) > 0:
-            final_attn_mask = ~or_reduce(masks)
-
-        n, device = q.shape[-2], q.device
+        n = q.shape[-2]

        causal = self.causal if causal is None else causal

--- a/comfy/ldm/audio/embedders.py
+++ b/comfy/ldm/audio/embedders.py
@ -2,8 +2,8 @@

 import torch
 import torch.nn as nn
-from torch import Tensor, einsum
-from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, TypeVar, Union
+from torch import Tensor
+from typing import List, Union
 from einops import rearrange
 import math
 import comfy.ops
--- a/comfy/ldm/aura/mmdit.py
+++ b/comfy/ldm/aura/mmdit.py
@ -147,7 +147,6 @@ class DoubleAttention(nn.Module):

        bsz, seqlen1, _ = c.shape
        bsz, seqlen2, _ = x.shape
-        seqlen = seqlen1 + seqlen2

        cq, ck, cv = self.w1q(c), self.w1k(c), self.w1v(c)
        cq = cq.view(bsz, seqlen1, self.n_heads, self.head_dim)
@ -382,7 +381,6 @@ class MMDiT(nn.Module):
        pe_new = pe_as_2d.squeeze(0).permute(1, 2, 0).flatten(0, 1)
        self.positional_encoding.data = pe_new.unsqueeze(0).contiguous()
        self.h_max, self.w_max = target_dim
-        print("PE extended to", target_dim)

    def pe_selection_index_based_on_dim(self, h, w):
        h_p, w_p = h // self.patch_size, w // self.patch_size
--- a/comfy/ldm/cascade/controlnet.py
+++ b/comfy/ldm/cascade/controlnet.py
@ -16,7 +16,6 @@
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """

-import torch
 import torchvision
 from torch import nn
 from .common import LayerNorm2d_op
--- a/comfy/ldm/cascade/stage_a.py
+++ b/comfy/ldm/cascade/stage_a.py
@ -19,6 +19,10 @@
 import torch
 from torch import nn
 from torch.autograd import Function
+import comfy.ops
+
+ops = comfy.ops.disable_weight_init
+

 class vector_quantize(Function):
    @staticmethod
@ -121,15 +125,15 @@ class ResBlock(nn.Module):
        self.norm1 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
        self.depthwise = nn.Sequential(
            nn.ReplicationPad2d(1),
-            nn.Conv2d(c, c, kernel_size=3, groups=c)
+            ops.Conv2d(c, c, kernel_size=3, groups=c)
        )

        # channelwise
        self.norm2 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
        self.channelwise = nn.Sequential(
-            nn.Linear(c, c_hidden),
+            ops.Linear(c, c_hidden),
            nn.GELU(),
-            nn.Linear(c_hidden, c),
+            ops.Linear(c_hidden, c),
        )

        self.gammas = nn.Parameter(torch.zeros(6), requires_grad=True)
@ -171,16 +175,16 @@ class StageA(nn.Module):
        # Encoder blocks
        self.in_block = nn.Sequential(
            nn.PixelUnshuffle(2),
-            nn.Conv2d(3 * 4, c_levels[0], kernel_size=1)
+            ops.Conv2d(3 * 4, c_levels[0], kernel_size=1)
        )
        down_blocks = []
        for i in range(levels):
            if i > 0:
-                down_blocks.append(nn.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
+                down_blocks.append(ops.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
            block = ResBlock(c_levels[i], c_levels[i] * 4)
            down_blocks.append(block)
        down_blocks.append(nn.Sequential(
-            nn.Conv2d(c_levels[-1], c_latent, kernel_size=1, bias=False),
+            ops.Conv2d(c_levels[-1], c_latent, kernel_size=1, bias=False),
            nn.BatchNorm2d(c_latent),  # then normalize them to have mean 0 and std 1
        ))
        self.down_blocks = nn.Sequential(*down_blocks)
@ -191,7 +195,7 @@ class StageA(nn.Module):

        # Decoder blocks
        up_blocks = [nn.Sequential(
-            nn.Conv2d(c_latent, c_levels[-1], kernel_size=1)
+            ops.Conv2d(c_latent, c_levels[-1], kernel_size=1)
        )]
        for i in range(levels):
            for j in range(bottleneck_blocks if i == 0 else 1):
@ -199,11 +203,11 @@ class StageA(nn.Module):
                up_blocks.append(block)
            if i < levels - 1:
                up_blocks.append(
-                    nn.ConvTranspose2d(c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2,
+                    ops.ConvTranspose2d(c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2,
                                       padding=1))
        self.up_blocks = nn.Sequential(*up_blocks)
        self.out_block = nn.Sequential(
-            nn.Conv2d(c_levels[0], 3 * 4, kernel_size=1),
+            ops.Conv2d(c_levels[0], 3 * 4, kernel_size=1),
            nn.PixelShuffle(2),
        )

@ -232,17 +236,17 @@ class Discriminator(nn.Module):
        super().__init__()
        d = max(depth - 3, 3)
        layers = [
-            nn.utils.spectral_norm(nn.Conv2d(c_in, c_hidden // (2 ** d), kernel_size=3, stride=2, padding=1)),
+            nn.utils.spectral_norm(ops.Conv2d(c_in, c_hidden // (2 ** d), kernel_size=3, stride=2, padding=1)),
            nn.LeakyReLU(0.2),
        ]
        for i in range(depth - 1):
            c_in = c_hidden // (2 ** max((d - i), 0))
            c_out = c_hidden // (2 ** max((d - 1 - i), 0))
-            layers.append(nn.utils.spectral_norm(nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
+            layers.append(nn.utils.spectral_norm(ops.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
            layers.append(nn.InstanceNorm2d(c_out))
            layers.append(nn.LeakyReLU(0.2))
        self.encoder = nn.Sequential(*layers)
-        self.shuffle = nn.Conv2d((c_hidden + c_cond) if c_cond > 0 else c_hidden, 1, kernel_size=1)
+        self.shuffle = ops.Conv2d((c_hidden + c_cond) if c_cond > 0 else c_hidden, 1, kernel_size=1)
        self.logits = nn.Sigmoid()

    def forward(self, x, cond=None):
--- a/comfy/ldm/cascade/stage_c_coder.py
+++ b/comfy/ldm/cascade/stage_c_coder.py
@ -19,6 +19,9 @@ import torch
 import torchvision
 from torch import nn

+import comfy.ops
+
+ops = comfy.ops.disable_weight_init

 # EfficientNet
 class EfficientNetEncoder(nn.Module):
@ -26,7 +29,7 @@ class EfficientNetEncoder(nn.Module):
        super().__init__()
        self.backbone = torchvision.models.efficientnet_v2_s().features.eval()
        self.mapper = nn.Sequential(
-            nn.Conv2d(1280, c_latent, kernel_size=1, bias=False),
+            ops.Conv2d(1280, c_latent, kernel_size=1, bias=False),
            nn.BatchNorm2d(c_latent, affine=False),  # then normalize them to have mean 0 and std 1
        )
        self.mean = nn.Parameter(torch.tensor([0.485, 0.456, 0.406]))
@ -34,7 +37,7 @@ class EfficientNetEncoder(nn.Module):

    def forward(self, x):
        x = x * 0.5 + 0.5
-        x = (x - self.mean.view([3,1,1])) / self.std.view([3,1,1])
+        x = (x - self.mean.view([3,1,1]).to(device=x.device, dtype=x.dtype)) / self.std.view([3,1,1]).to(device=x.device, dtype=x.dtype)
        o = self.mapper(self.backbone(x))
        return o

@ -44,39 +47,39 @@ class Previewer(nn.Module):
    def __init__(self, c_in=16, c_hidden=512, c_out=3):
        super().__init__()
        self.blocks = nn.Sequential(
-            nn.Conv2d(c_in, c_hidden, kernel_size=1),  # 16 channels to 512 channels
+            ops.Conv2d(c_in, c_hidden, kernel_size=1),  # 16 channels to 512 channels
            nn.GELU(),
            nn.BatchNorm2d(c_hidden),

-            nn.Conv2d(c_hidden, c_hidden, kernel_size=3, padding=1),
+            ops.Conv2d(c_hidden, c_hidden, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden),

-            nn.ConvTranspose2d(c_hidden, c_hidden // 2, kernel_size=2, stride=2),  # 16 -> 32
+            ops.ConvTranspose2d(c_hidden, c_hidden // 2, kernel_size=2, stride=2),  # 16 -> 32
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 2),

-            nn.Conv2d(c_hidden // 2, c_hidden // 2, kernel_size=3, padding=1),
+            ops.Conv2d(c_hidden // 2, c_hidden // 2, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 2),

-            nn.ConvTranspose2d(c_hidden // 2, c_hidden // 4, kernel_size=2, stride=2),  # 32 -> 64
+            ops.ConvTranspose2d(c_hidden // 2, c_hidden // 4, kernel_size=2, stride=2),  # 32 -> 64
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            nn.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
+            ops.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            nn.ConvTranspose2d(c_hidden // 4, c_hidden // 4, kernel_size=2, stride=2),  # 64 -> 128
+            ops.ConvTranspose2d(c_hidden // 4, c_hidden // 4, kernel_size=2, stride=2),  # 64 -> 128
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            nn.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
+            ops.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            nn.Conv2d(c_hidden // 4, c_out, kernel_size=1),
+            ops.Conv2d(c_hidden // 4, c_out, kernel_size=1),
        )

    def forward(self, x):
--- a/comfy/ldm/common_dit.py
+++ b/comfy/ldm/common_dit.py
@ -1,27 +1,16 @@
 import torch
-import comfy.ops
+import comfy.rmsnorm
+

 def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):
    if padding_mode == "circular" and (torch.jit.is_tracing() or torch.jit.is_scripting()):
        padding_mode = "reflect"
-    pad_h = (patch_size[0] - img.shape[-2] % patch_size[0]) % patch_size[0]
-    pad_w = (patch_size[1] - img.shape[-1] % patch_size[1]) % patch_size[1]
-    return torch.nn.functional.pad(img, (0, pad_w, 0, pad_h), mode=padding_mode)

-try:
-    rms_norm_torch = torch.nn.functional.rms_norm
-except:
-    rms_norm_torch = None
+    pad = ()
+    for i in range(img.ndim - 2):
+        pad = (0, (patch_size[i] - img.shape[i + 2] % patch_size[i]) % patch_size[i]) + pad

-def rms_norm(x, weight=None, eps=1e-6):
-    if rms_norm_torch is not None and not (torch.jit.is_tracing() or torch.jit.is_scripting()):
-        if weight is None:
-            return rms_norm_torch(x, (x.shape[-1],), eps=eps)
-        else:
-            return rms_norm_torch(x, weight.shape, weight=comfy.ops.cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
-    else:
-        r = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
-        if weight is None:
-            return r
-        else:
-            return r * comfy.ops.cast_to(weight, dtype=x.dtype, device=x.device)
+    return torch.nn.functional.pad(img, pad, mode=padding_mode)
+
+
+rms_norm = comfy.rmsnorm.rms_norm
--- a/comfy/ldm/cosmos/blocks.py
+++ b/comfy/ldm/cosmos/blocks.py
@ -0,0 +1,808 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import Optional
+import logging
+
+import numpy as np
+import torch
+from einops import rearrange, repeat
+from einops.layers.torch import Rearrange
+from torch import nn
+
+from comfy.ldm.modules.diffusionmodules.mmdit import RMSNorm
+from comfy.ldm.modules.attention import optimized_attention
+
+
+def apply_rotary_pos_emb(
+    t: torch.Tensor,
+    freqs: torch.Tensor,
+) -> torch.Tensor:
+    t_ = t.reshape(*t.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2).float()
+    t_out = freqs[..., 0] * t_[..., 0] + freqs[..., 1] * t_[..., 1]
+    t_out = t_out.movedim(-1, -2).reshape(*t.shape).type_as(t)
+    return t_out
+
+
+def get_normalization(name: str, channels: int, weight_args={}):
+    if name == "I":
+        return nn.Identity()
+    elif name == "R":
+        return RMSNorm(channels, elementwise_affine=True, eps=1e-6, **weight_args)
+    else:
+        raise ValueError(f"Normalization {name} not found")
+
+
+class BaseAttentionOp(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+
+class Attention(nn.Module):
+    """
+    Generalized attention impl.
+
+    Allowing for both self-attention and cross-attention configurations depending on whether a `context_dim` is provided.
+    If `context_dim` is None, self-attention is assumed.
+
+    Parameters:
+        query_dim (int): Dimension of each query vector.
+        context_dim (int, optional): Dimension of each context vector. If None, self-attention is assumed.
+        heads (int, optional): Number of attention heads. Defaults to 8.
+        dim_head (int, optional): Dimension of each head. Defaults to 64.
+        dropout (float, optional): Dropout rate applied to the output of the attention block. Defaults to 0.0.
+        attn_op (BaseAttentionOp, optional): Custom attention operation to be used instead of the default.
+        qkv_bias (bool, optional): If True, adds a learnable bias to query, key, and value projections. Defaults to False.
+        out_bias (bool, optional): If True, adds a learnable bias to the output projection. Defaults to False.
+        qkv_norm (str, optional): A string representing normalization strategies for query, key, and value projections.
+                                  Defaults to "SSI".
+        qkv_norm_mode (str, optional): A string representing normalization mode for query, key, and value projections.
+                                        Defaults to 'per_head'. Only support 'per_head'.
+
+    Examples:
+        >>> attn = Attention(query_dim=128, context_dim=256, heads=4, dim_head=32, dropout=0.1)
+        >>> query = torch.randn(10, 128)  # Batch size of 10
+        >>> context = torch.randn(10, 256)  # Batch size of 10
+        >>> output = attn(query, context)  # Perform the attention operation
+
+    Note:
+        https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
+    """
+
+    def __init__(
+        self,
+        query_dim: int,
+        context_dim=None,
+        heads=8,
+        dim_head=64,
+        dropout=0.0,
+        attn_op: Optional[BaseAttentionOp] = None,
+        qkv_bias: bool = False,
+        out_bias: bool = False,
+        qkv_norm: str = "SSI",
+        qkv_norm_mode: str = "per_head",
+        backend: str = "transformer_engine",
+        qkv_format: str = "bshd",
+        weight_args={},
+        operations=None,
+    ) -> None:
+        super().__init__()
+
+        self.is_selfattn = context_dim is None  # self attention
+
+        inner_dim = dim_head * heads
+        context_dim = query_dim if context_dim is None else context_dim
+
+        self.heads = heads
+        self.dim_head = dim_head
+        self.qkv_norm_mode = qkv_norm_mode
+        self.qkv_format = qkv_format
+
+        if self.qkv_norm_mode == "per_head":
+            norm_dim = dim_head
+        else:
+            raise ValueError(f"Normalization mode {self.qkv_norm_mode} not found, only support 'per_head'")
+
+        self.backend = backend
+
+        self.to_q = nn.Sequential(
+            operations.Linear(query_dim, inner_dim, bias=qkv_bias, **weight_args),
+            get_normalization(qkv_norm[0], norm_dim),
+        )
+        self.to_k = nn.Sequential(
+            operations.Linear(context_dim, inner_dim, bias=qkv_bias, **weight_args),
+            get_normalization(qkv_norm[1], norm_dim),
+        )
+        self.to_v = nn.Sequential(
+            operations.Linear(context_dim, inner_dim, bias=qkv_bias, **weight_args),
+            get_normalization(qkv_norm[2], norm_dim),
+        )
+
+        self.to_out = nn.Sequential(
+            operations.Linear(inner_dim, query_dim, bias=out_bias, **weight_args),
+            nn.Dropout(dropout),
+        )
+
+    def cal_qkv(
+        self, x, context=None, mask=None, rope_emb=None, **kwargs
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        del kwargs
+
+
+        """
+        self.to_q, self.to_k, self.to_v are nn.Sequential with projection + normalization layers.
+        Before 07/24/2024, these modules normalize across all heads.
+        After 07/24/2024, to support tensor parallelism and follow the common practice in the community,
+        we support to normalize per head.
+        To keep the checkpoint copatibility with the previous code,
+        we keep the nn.Sequential but call the projection and the normalization layers separately.
+        We use a flag `self.qkv_norm_mode` to control the normalization behavior.
+        The default value of `self.qkv_norm_mode` is "per_head", which means we normalize per head.
+        """
+        if self.qkv_norm_mode == "per_head":
+            q = self.to_q[0](x)
+            context = x if context is None else context
+            k = self.to_k[0](context)
+            v = self.to_v[0](context)
+            q, k, v = map(
+                lambda t: rearrange(t, "s b (n c) -> b n s c", n=self.heads, c=self.dim_head),
+                (q, k, v),
+            )
+        else:
+            raise ValueError(f"Normalization mode {self.qkv_norm_mode} not found, only support 'per_head'")
+
+        q = self.to_q[1](q)
+        k = self.to_k[1](k)
+        v = self.to_v[1](v)
+        if self.is_selfattn and rope_emb is not None:  # only apply to self-attention!
+            # apply_rotary_pos_emb inlined
+            q_shape = q.shape
+            q = q.reshape(*q.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2)
+            q = rope_emb[..., 0] * q[..., 0] + rope_emb[..., 1] * q[..., 1]
+            q = q.movedim(-1, -2).reshape(*q_shape).to(x.dtype)
+
+            # apply_rotary_pos_emb inlined
+            k_shape = k.shape
+            k = k.reshape(*k.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2)
+            k = rope_emb[..., 0] * k[..., 0] + rope_emb[..., 1] * k[..., 1]
+            k = k.movedim(-1, -2).reshape(*k_shape).to(x.dtype)
+        return q, k, v
+
+    def forward(
+        self,
+        x,
+        context=None,
+        mask=None,
+        rope_emb=None,
+        **kwargs,
+    ):
+        """
+        Args:
+            x (Tensor): The query tensor of shape [B, Mq, K]
+            context (Optional[Tensor]): The key tensor of shape [B, Mk, K] or use x as context [self attention] if None
+        """
+        q, k, v = self.cal_qkv(x, context, mask, rope_emb=rope_emb, **kwargs)
+        out = optimized_attention(q, k, v, self.heads, skip_reshape=True, mask=mask, skip_output_reshape=True)
+        del q, k, v
+        out = rearrange(out, " b n s c -> s b (n c)")
+        return self.to_out(out)
+
+
+class FeedForward(nn.Module):
+    """
+    Transformer FFN with optional gating
+
+    Parameters:
+        d_model (int): Dimensionality of input features.
+        d_ff (int): Dimensionality of the hidden layer.
+        dropout (float, optional): Dropout rate applied after the activation function. Defaults to 0.1.
+        activation (callable, optional): The activation function applied after the first linear layer.
+                                         Defaults to nn.ReLU().
+        is_gated (bool, optional): If set to True, incorporates gating mechanism to the feed-forward layer.
+                                   Defaults to False.
+        bias (bool, optional): If set to True, adds a bias to the linear layers. Defaults to True.
+
+    Example:
+        >>> ff = FeedForward(d_model=512, d_ff=2048)
+        >>> x = torch.randn(64, 10, 512)  # Example input tensor
+        >>> output = ff(x)
+        >>> print(output.shape)  # Expected shape: (64, 10, 512)
+    """
+
+    def __init__(
+        self,
+        d_model: int,
+        d_ff: int,
+        dropout: float = 0.1,
+        activation=nn.ReLU(),
+        is_gated: bool = False,
+        bias: bool = False,
+        weight_args={},
+        operations=None,
+    ) -> None:
+        super().__init__()
+
+        self.layer1 = operations.Linear(d_model, d_ff, bias=bias, **weight_args)
+        self.layer2 = operations.Linear(d_ff, d_model, bias=bias, **weight_args)
+
+        self.dropout = nn.Dropout(dropout)
+        self.activation = activation
+        self.is_gated = is_gated
+        if is_gated:
+            self.linear_gate = operations.Linear(d_model, d_ff, bias=False, **weight_args)
+
+    def forward(self, x: torch.Tensor):
+        g = self.activation(self.layer1(x))
+        if self.is_gated:
+            x = g * self.linear_gate(x)
+        else:
+            x = g
+        assert self.dropout.p == 0.0, "we skip dropout"
+        return self.layer2(x)
+
+
+class GPT2FeedForward(FeedForward):
+    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1, bias: bool = False, weight_args={}, operations=None):
+        super().__init__(
+            d_model=d_model,
+            d_ff=d_ff,
+            dropout=dropout,
+            activation=nn.GELU(),
+            is_gated=False,
+            bias=bias,
+            weight_args=weight_args,
+            operations=operations,
+        )
+
+    def forward(self, x: torch.Tensor):
+        assert self.dropout.p == 0.0, "we skip dropout"
+
+        x = self.layer1(x)
+        x = self.activation(x)
+        x = self.layer2(x)
+
+        return x
+
+
+def modulate(x, shift, scale):
+    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+
+
+class Timesteps(nn.Module):
+    def __init__(self, num_channels):
+        super().__init__()
+        self.num_channels = num_channels
+
+    def forward(self, timesteps):
+        half_dim = self.num_channels // 2
+        exponent = -math.log(10000) * torch.arange(half_dim, dtype=torch.float32, device=timesteps.device)
+        exponent = exponent / (half_dim - 0.0)
+
+        emb = torch.exp(exponent)
+        emb = timesteps[:, None].float() * emb[None, :]
+
+        sin_emb = torch.sin(emb)
+        cos_emb = torch.cos(emb)
+        emb = torch.cat([cos_emb, sin_emb], dim=-1)
+
+        return emb
+
+
+class TimestepEmbedding(nn.Module):
+    def __init__(self, in_features: int, out_features: int, use_adaln_lora: bool = False, weight_args={}, operations=None):
+        super().__init__()
+        logging.debug(
+            f"Using AdaLN LoRA Flag:  {use_adaln_lora}. We enable bias if no AdaLN LoRA for backward compatibility."
+        )
+        self.linear_1 = operations.Linear(in_features, out_features, bias=not use_adaln_lora, **weight_args)
+        self.activation = nn.SiLU()
+        self.use_adaln_lora = use_adaln_lora
+        if use_adaln_lora:
+            self.linear_2 = operations.Linear(out_features, 3 * out_features, bias=False, **weight_args)
+        else:
+            self.linear_2 = operations.Linear(out_features, out_features, bias=True, **weight_args)
+
+    def forward(self, sample: torch.Tensor) -> torch.Tensor:
+        emb = self.linear_1(sample)
+        emb = self.activation(emb)
+        emb = self.linear_2(emb)
+
+        if self.use_adaln_lora:
+            adaln_lora_B_3D = emb
+            emb_B_D = sample
+        else:
+            emb_B_D = emb
+            adaln_lora_B_3D = None
+
+        return emb_B_D, adaln_lora_B_3D
+
+
+class FourierFeatures(nn.Module):
+    """
+    Implements a layer that generates Fourier features from input tensors, based on randomly sampled
+    frequencies and phases. This can help in learning high-frequency functions in low-dimensional problems.
+
+    [B] -> [B, D]
+
+    Parameters:
+        num_channels (int): The number of Fourier features to generate.
+        bandwidth (float, optional): The scaling factor for the frequency of the Fourier features. Defaults to 1.
+        normalize (bool, optional): If set to True, the outputs are scaled by sqrt(2), usually to normalize
+                                    the variance of the features. Defaults to False.
+
+    Example:
+        >>> layer = FourierFeatures(num_channels=256, bandwidth=0.5, normalize=True)
+        >>> x = torch.randn(10, 256)  # Example input tensor
+        >>> output = layer(x)
+        >>> print(output.shape)  # Expected shape: (10, 256)
+    """
+
+    def __init__(self, num_channels, bandwidth=1, normalize=False):
+        super().__init__()
+        self.register_buffer("freqs", 2 * np.pi * bandwidth * torch.randn(num_channels), persistent=True)
+        self.register_buffer("phases", 2 * np.pi * torch.rand(num_channels), persistent=True)
+        self.gain = np.sqrt(2) if normalize else 1
+
+    def forward(self, x, gain: float = 1.0):
+        """
+        Apply the Fourier feature transformation to the input tensor.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+            gain (float, optional): An additional gain factor applied during the forward pass. Defaults to 1.
+
+        Returns:
+            torch.Tensor: The transformed tensor, with Fourier features applied.
+        """
+        in_dtype = x.dtype
+        x = x.to(torch.float32).ger(self.freqs.to(torch.float32)).add(self.phases.to(torch.float32))
+        x = x.cos().mul(self.gain * gain).to(in_dtype)
+        return x
+
+
+class PatchEmbed(nn.Module):
+    """
+    PatchEmbed is a module for embedding patches from an input tensor by applying either 3D or 2D convolutional layers,
+    depending on the . This module can process inputs with temporal (video) and spatial (image) dimensions,
+    making it suitable for video and image processing tasks. It supports dividing the input into patches
+    and embedding each patch into a vector of size `out_channels`.
+
+    Parameters:
+    - spatial_patch_size (int): The size of each spatial patch.
+    - temporal_patch_size (int): The size of each temporal patch.
+    - in_channels (int): Number of input channels. Default: 3.
+    - out_channels (int): The dimension of the embedding vector for each patch. Default: 768.
+    - bias (bool): If True, adds a learnable bias to the output of the convolutional layers. Default: True.
+    """
+
+    def __init__(
+        self,
+        spatial_patch_size,
+        temporal_patch_size,
+        in_channels=3,
+        out_channels=768,
+        bias=True,
+        weight_args={},
+        operations=None,
+    ):
+        super().__init__()
+        self.spatial_patch_size = spatial_patch_size
+        self.temporal_patch_size = temporal_patch_size
+
+        self.proj = nn.Sequential(
+            Rearrange(
+                "b c (t r) (h m) (w n) -> b t h w (c r m n)",
+                r=temporal_patch_size,
+                m=spatial_patch_size,
+                n=spatial_patch_size,
+            ),
+            operations.Linear(
+                in_channels * spatial_patch_size * spatial_patch_size * temporal_patch_size, out_channels, bias=bias, **weight_args
+            ),
+        )
+        self.out = nn.Identity()
+
+    def forward(self, x):
+        """
+        Forward pass of the PatchEmbed module.
+
+        Parameters:
+        - x (torch.Tensor): The input tensor of shape (B, C, T, H, W) where
+            B is the batch size,
+            C is the number of channels,
+            T is the temporal dimension,
+            H is the height, and
+            W is the width of the input.
+
+        Returns:
+        - torch.Tensor: The embedded patches as a tensor, with shape b t h w c.
+        """
+        assert x.dim() == 5
+        _, _, T, H, W = x.shape
+        assert H % self.spatial_patch_size == 0 and W % self.spatial_patch_size == 0
+        assert T % self.temporal_patch_size == 0
+        x = self.proj(x)
+        return self.out(x)
+
+
+class FinalLayer(nn.Module):
+    """
+    The final layer of video DiT.
+    """
+
+    def __init__(
+        self,
+        hidden_size,
+        spatial_patch_size,
+        temporal_patch_size,
+        out_channels,
+        use_adaln_lora: bool = False,
+        adaln_lora_dim: int = 256,
+        weight_args={},
+        operations=None,
+    ):
+        super().__init__()
+        self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **weight_args)
+        self.linear = operations.Linear(
+            hidden_size, spatial_patch_size * spatial_patch_size * temporal_patch_size * out_channels, bias=False, **weight_args
+        )
+        self.hidden_size = hidden_size
+        self.n_adaln_chunks = 2
+        self.use_adaln_lora = use_adaln_lora
+        if use_adaln_lora:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                operations.Linear(hidden_size, adaln_lora_dim, bias=False, **weight_args),
+                operations.Linear(adaln_lora_dim, self.n_adaln_chunks * hidden_size, bias=False, **weight_args),
+            )
+        else:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(), operations.Linear(hidden_size, self.n_adaln_chunks * hidden_size, bias=False, **weight_args)
+            )
+
+    def forward(
+        self,
+        x_BT_HW_D,
+        emb_B_D,
+        adaln_lora_B_3D: Optional[torch.Tensor] = None,
+    ):
+        if self.use_adaln_lora:
+            assert adaln_lora_B_3D is not None
+            shift_B_D, scale_B_D = (self.adaLN_modulation(emb_B_D) + adaln_lora_B_3D[:, : 2 * self.hidden_size]).chunk(
+                2, dim=1
+            )
+        else:
+            shift_B_D, scale_B_D = self.adaLN_modulation(emb_B_D).chunk(2, dim=1)
+
+        B = emb_B_D.shape[0]
+        T = x_BT_HW_D.shape[0] // B
+        shift_BT_D, scale_BT_D = repeat(shift_B_D, "b d -> (b t) d", t=T), repeat(scale_B_D, "b d -> (b t) d", t=T)
+        x_BT_HW_D = modulate(self.norm_final(x_BT_HW_D), shift_BT_D, scale_BT_D)
+
+        x_BT_HW_D = self.linear(x_BT_HW_D)
+        return x_BT_HW_D
+
+
+class VideoAttn(nn.Module):
+    """
+    Implements video attention with optional cross-attention capabilities.
+
+    This module processes video features while maintaining their spatio-temporal structure. It can perform
+    self-attention within the video features or cross-attention with external context features.
+
+    Parameters:
+        x_dim (int): Dimension of input feature vectors
+        context_dim (Optional[int]): Dimension of context features for cross-attention. None for self-attention
+        num_heads (int): Number of attention heads
+        bias (bool): Whether to include bias in attention projections. Default: False
+        qkv_norm_mode (str): Normalization mode for query/key/value projections. Must be "per_head". Default: "per_head"
+        x_format (str): Format of input tensor. Must be "BTHWD". Default: "BTHWD"
+
+    Input shape:
+        - x: (T, H, W, B, D) video features
+        - context (optional): (M, B, D) context features for cross-attention
+        where:
+            T: temporal dimension
+            H: height
+            W: width
+            B: batch size
+            D: feature dimension
+            M: context sequence length
+    """
+
+    def __init__(
+        self,
+        x_dim: int,
+        context_dim: Optional[int],
+        num_heads: int,
+        bias: bool = False,
+        qkv_norm_mode: str = "per_head",
+        x_format: str = "BTHWD",
+        weight_args={},
+        operations=None,
+    ) -> None:
+        super().__init__()
+        self.x_format = x_format
+
+        self.attn = Attention(
+            x_dim,
+            context_dim,
+            num_heads,
+            x_dim // num_heads,
+            qkv_bias=bias,
+            qkv_norm="RRI",
+            out_bias=bias,
+            qkv_norm_mode=qkv_norm_mode,
+            qkv_format="sbhd",
+            weight_args=weight_args,
+            operations=operations,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        context: Optional[torch.Tensor] = None,
+        crossattn_mask: Optional[torch.Tensor] = None,
+        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Forward pass for video attention.
+
+        Args:
+            x (Tensor): Input tensor of shape (B, T, H, W, D) or (T, H, W, B, D) representing batches of video data.
+            context (Tensor): Context tensor of shape (B, M, D) or (M, B, D),
+            where M is the sequence length of the context.
+            crossattn_mask (Optional[Tensor]): An optional mask for cross-attention mechanisms.
+            rope_emb_L_1_1_D (Optional[Tensor]):
+            Rotary positional embedding tensor of shape (L, 1, 1, D). L == THW for current video training.
+
+        Returns:
+            Tensor: The output tensor with applied attention, maintaining the input shape.
+        """
+
+        x_T_H_W_B_D = x
+        context_M_B_D = context
+        T, H, W, B, D = x_T_H_W_B_D.shape
+        x_THW_B_D = rearrange(x_T_H_W_B_D, "t h w b d -> (t h w) b d")
+        x_THW_B_D = self.attn(
+            x_THW_B_D,
+            context_M_B_D,
+            crossattn_mask,
+            rope_emb=rope_emb_L_1_1_D,
+        )
+        x_T_H_W_B_D = rearrange(x_THW_B_D, "(t h w) b d -> t h w b d", h=H, w=W)
+        return x_T_H_W_B_D
+
+
+def adaln_norm_state(norm_state, x, scale, shift):
+    normalized = norm_state(x)
+    return normalized * (1 + scale) + shift
+
+
+class DITBuildingBlock(nn.Module):
+    """
+    A building block for the DiT (Diffusion Transformer) architecture that supports different types of
+    attention and MLP operations with adaptive layer normalization.
+
+    Parameters:
+        block_type (str): Type of block - one of:
+            - "cross_attn"/"ca": Cross-attention
+            - "full_attn"/"fa": Full self-attention
+            - "mlp"/"ff": MLP/feedforward block
+        x_dim (int): Dimension of input features
+        context_dim (Optional[int]): Dimension of context features for cross-attention
+        num_heads (int): Number of attention heads
+        mlp_ratio (float): MLP hidden dimension multiplier. Default: 4.0
+        bias (bool): Whether to use bias in layers. Default: False
+        mlp_dropout (float): Dropout rate for MLP. Default: 0.0
+        qkv_norm_mode (str): QKV normalization mode. Default: "per_head"
+        x_format (str): Input tensor format. Default: "BTHWD"
+        use_adaln_lora (bool): Whether to use AdaLN-LoRA. Default: False
+        adaln_lora_dim (int): Dimension for AdaLN-LoRA. Default: 256
+    """
+
+    def __init__(
+        self,
+        block_type: str,
+        x_dim: int,
+        context_dim: Optional[int],
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        bias: bool = False,
+        mlp_dropout: float = 0.0,
+        qkv_norm_mode: str = "per_head",
+        x_format: str = "BTHWD",
+        use_adaln_lora: bool = False,
+        adaln_lora_dim: int = 256,
+        weight_args={},
+        operations=None
+    ) -> None:
+        block_type = block_type.lower()
+
+        super().__init__()
+        self.x_format = x_format
+        if block_type in ["cross_attn", "ca"]:
+            self.block = VideoAttn(
+                x_dim,
+                context_dim,
+                num_heads,
+                bias=bias,
+                qkv_norm_mode=qkv_norm_mode,
+                x_format=self.x_format,
+                weight_args=weight_args,
+                operations=operations,
+            )
+        elif block_type in ["full_attn", "fa"]:
+            self.block = VideoAttn(
+                x_dim, None, num_heads, bias=bias, qkv_norm_mode=qkv_norm_mode, x_format=self.x_format, weight_args=weight_args, operations=operations
+            )
+        elif block_type in ["mlp", "ff"]:
+            self.block = GPT2FeedForward(x_dim, int(x_dim * mlp_ratio), dropout=mlp_dropout, bias=bias, weight_args=weight_args, operations=operations)
+        else:
+            raise ValueError(f"Unknown block type: {block_type}")
+
+        self.block_type = block_type
+        self.use_adaln_lora = use_adaln_lora
+
+        self.norm_state = nn.LayerNorm(x_dim, elementwise_affine=False, eps=1e-6)
+        self.n_adaln_chunks = 3
+        if use_adaln_lora:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                operations.Linear(x_dim, adaln_lora_dim, bias=False, **weight_args),
+                operations.Linear(adaln_lora_dim, self.n_adaln_chunks * x_dim, bias=False, **weight_args),
+            )
+        else:
+            self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(x_dim, self.n_adaln_chunks * x_dim, bias=False, **weight_args))
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        emb_B_D: torch.Tensor,
+        crossattn_emb: torch.Tensor,
+        crossattn_mask: Optional[torch.Tensor] = None,
+        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
+        adaln_lora_B_3D: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Forward pass for dynamically configured blocks with adaptive normalization.
+
+        Args:
+            x (Tensor): Input tensor of shape (B, T, H, W, D) or (T, H, W, B, D).
+            emb_B_D (Tensor): Embedding tensor for adaptive layer normalization modulation.
+            crossattn_emb (Tensor): Tensor for cross-attention blocks.
+            crossattn_mask (Optional[Tensor]): Optional mask for cross-attention.
+            rope_emb_L_1_1_D (Optional[Tensor]):
+            Rotary positional embedding tensor of shape (L, 1, 1, D). L == THW for current video training.
+
+        Returns:
+            Tensor: The output tensor after processing through the configured block and adaptive normalization.
+        """
+        if self.use_adaln_lora:
+            shift_B_D, scale_B_D, gate_B_D = (self.adaLN_modulation(emb_B_D) + adaln_lora_B_3D).chunk(
+                self.n_adaln_chunks, dim=1
+            )
+        else:
+            shift_B_D, scale_B_D, gate_B_D = self.adaLN_modulation(emb_B_D).chunk(self.n_adaln_chunks, dim=1)
+
+        shift_1_1_1_B_D, scale_1_1_1_B_D, gate_1_1_1_B_D = (
+            shift_B_D.unsqueeze(0).unsqueeze(0).unsqueeze(0),
+            scale_B_D.unsqueeze(0).unsqueeze(0).unsqueeze(0),
+            gate_B_D.unsqueeze(0).unsqueeze(0).unsqueeze(0),
+        )
+
+        if self.block_type in ["mlp", "ff"]:
+            x = x + gate_1_1_1_B_D * self.block(
+                adaln_norm_state(self.norm_state, x, scale_1_1_1_B_D, shift_1_1_1_B_D),
+            )
+        elif self.block_type in ["full_attn", "fa"]:
+            x = x + gate_1_1_1_B_D * self.block(
+                adaln_norm_state(self.norm_state, x, scale_1_1_1_B_D, shift_1_1_1_B_D),
+                context=None,
+                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
+            )
+        elif self.block_type in ["cross_attn", "ca"]:
+            x = x + gate_1_1_1_B_D * self.block(
+                adaln_norm_state(self.norm_state, x, scale_1_1_1_B_D, shift_1_1_1_B_D),
+                context=crossattn_emb,
+                crossattn_mask=crossattn_mask,
+                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
+            )
+        else:
+            raise ValueError(f"Unknown block type: {self.block_type}")
+
+        return x
+
+
+class GeneralDITTransformerBlock(nn.Module):
+    """
+    A wrapper module that manages a sequence of DITBuildingBlocks to form a complete transformer layer.
+    Each block in the sequence is specified by a block configuration string.
+
+    Parameters:
+        x_dim (int): Dimension of input features
+        context_dim (int): Dimension of context features for cross-attention blocks
+        num_heads (int): Number of attention heads
+        block_config (str): String specifying block sequence (e.g. "ca-fa-mlp" for cross-attention,
+                          full-attention, then MLP)
+        mlp_ratio (float): MLP hidden dimension multiplier. Default: 4.0
+        x_format (str): Input tensor format. Default: "BTHWD"
+        use_adaln_lora (bool): Whether to use AdaLN-LoRA. Default: False
+        adaln_lora_dim (int): Dimension for AdaLN-LoRA. Default: 256
+
+    The block_config string uses "-" to separate block types:
+        - "ca"/"cross_attn": Cross-attention block
+        - "fa"/"full_attn": Full self-attention block
+        - "mlp"/"ff": MLP/feedforward block
+
+    Example:
+        block_config = "ca-fa-mlp" creates a sequence of:
+        1. Cross-attention block
+        2. Full self-attention block
+        3. MLP block
+    """
+
+    def __init__(
+        self,
+        x_dim: int,
+        context_dim: int,
+        num_heads: int,
+        block_config: str,
+        mlp_ratio: float = 4.0,
+        x_format: str = "BTHWD",
+        use_adaln_lora: bool = False,
+        adaln_lora_dim: int = 256,
+        weight_args={},
+        operations=None
+    ):
+        super().__init__()
+        self.blocks = nn.ModuleList()
+        self.x_format = x_format
+        for block_type in block_config.split("-"):
+            self.blocks.append(
+                DITBuildingBlock(
+                    block_type,
+                    x_dim,
+                    context_dim,
+                    num_heads,
+                    mlp_ratio,
+                    x_format=self.x_format,
+                    use_adaln_lora=use_adaln_lora,
+                    adaln_lora_dim=adaln_lora_dim,
+                    weight_args=weight_args,
+                    operations=operations,
+                )
+            )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        emb_B_D: torch.Tensor,
+        crossattn_emb: torch.Tensor,
+        crossattn_mask: Optional[torch.Tensor] = None,
+        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
+        adaln_lora_B_3D: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        for block in self.blocks:
+            x = block(
+                x,
+                emb_B_D,
+                crossattn_emb,
+                crossattn_mask,
+                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
+                adaln_lora_B_3D=adaln_lora_B_3D,
+            )
+        return x
--- a/comfy/ldm/cosmos/cosmos_tokenizer/layers3d.py
+++ b/comfy/ldm/cosmos/cosmos_tokenizer/layers3d.py
--- a/comfy/ldm/cosmos/cosmos_tokenizer/patching.py
+++ b/comfy/ldm/cosmos/cosmos_tokenizer/patching.py
@ -0,0 +1,377 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The patcher and unpatcher implementation for 2D and 3D data.
+
+The idea of Haar wavelet is to compute LL, LH, HL, HH component as two 1D convolutions.
+One on the rows and one on the columns.
+For example, in 1D signal, we have [a, b], then the low-freq compoenent is [a + b] / 2 and high-freq is [a - b] / 2.
+We can use a 1D convolution with kernel [1, 1] and stride 2 to represent the L component.
+For H component, we can use a 1D convolution with kernel [1, -1] and stride 2.
+Although in principle, we typically only do additional Haar wavelet over the LL component. But here we do it for all
+   as we need to support downsampling for more than 2x.
+For example, 4x downsampling can be done by 2x Haar and additional 2x Haar, and the shape would be.
+   [3, 256, 256] -> [12, 128, 128] -> [48, 64, 64]
+"""
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+
+_WAVELETS = {
+    "haar": torch.tensor([0.7071067811865476, 0.7071067811865476]),
+    "rearrange": torch.tensor([1.0, 1.0]),
+}
+_PERSISTENT = False
+
+
+class Patcher(torch.nn.Module):
+    """A module to convert image tensors into patches using torch operations.
+
+    The main difference from `class Patching` is that this module implements
+    all operations using torch, rather than python or numpy, for efficiency purpose.
+
+    It's bit-wise identical to the Patching module outputs, with the added
+    benefit of being torch.jit scriptable.
+    """
+
+    def __init__(self, patch_size=1, patch_method="haar"):
+        super().__init__()
+        self.patch_size = patch_size
+        self.patch_method = patch_method
+        self.register_buffer(
+            "wavelets", _WAVELETS[patch_method], persistent=_PERSISTENT
+        )
+        self.range = range(int(torch.log2(torch.tensor(self.patch_size)).item()))
+        self.register_buffer(
+            "_arange",
+            torch.arange(_WAVELETS[patch_method].shape[0]),
+            persistent=_PERSISTENT,
+        )
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def forward(self, x):
+        if self.patch_method == "haar":
+            return self._haar(x)
+        elif self.patch_method == "rearrange":
+            return self._arrange(x)
+        else:
+            raise ValueError("Unknown patch method: " + self.patch_method)
+
+    def _dwt(self, x, mode="reflect", rescale=False):
+        dtype = x.dtype
+        h = self.wavelets.to(device=x.device)
+
+        n = h.shape[0]
+        g = x.shape[1]
+        hl = h.flip(0).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = (h * ((-1) ** self._arange.to(device=x.device))).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = hh.to(dtype=dtype)
+        hl = hl.to(dtype=dtype)
+
+        x = F.pad(x, pad=(n - 2, n - 1, n - 2, n - 1), mode=mode).to(dtype)
+        xl = F.conv2d(x, hl.unsqueeze(2), groups=g, stride=(1, 2))
+        xh = F.conv2d(x, hh.unsqueeze(2), groups=g, stride=(1, 2))
+        xll = F.conv2d(xl, hl.unsqueeze(3), groups=g, stride=(2, 1))
+        xlh = F.conv2d(xl, hh.unsqueeze(3), groups=g, stride=(2, 1))
+        xhl = F.conv2d(xh, hl.unsqueeze(3), groups=g, stride=(2, 1))
+        xhh = F.conv2d(xh, hh.unsqueeze(3), groups=g, stride=(2, 1))
+
+        out = torch.cat([xll, xlh, xhl, xhh], dim=1)
+        if rescale:
+            out = out / 2
+        return out
+
+    def _haar(self, x):
+        for _ in self.range:
+            x = self._dwt(x, rescale=True)
+        return x
+
+    def _arrange(self, x):
+        x = rearrange(
+            x,
+            "b c (h p1) (w p2) -> b (c p1 p2) h w",
+            p1=self.patch_size,
+            p2=self.patch_size,
+        ).contiguous()
+        return x
+
+
+class Patcher3D(Patcher):
+    """A 3D discrete wavelet transform for video data, expects 5D tensor, i.e. a batch of videos."""
+
+    def __init__(self, patch_size=1, patch_method="haar"):
+        super().__init__(patch_method=patch_method, patch_size=patch_size)
+        self.register_buffer(
+            "patch_size_buffer",
+            patch_size * torch.ones([1], dtype=torch.int32),
+            persistent=_PERSISTENT,
+        )
+
+    def _dwt(self, x, wavelet, mode="reflect", rescale=False):
+        dtype = x.dtype
+        h = self.wavelets.to(device=x.device)
+
+        n = h.shape[0]
+        g = x.shape[1]
+        hl = h.flip(0).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = (h * ((-1) ** self._arange.to(device=x.device))).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = hh.to(dtype=dtype)
+        hl = hl.to(dtype=dtype)
+
+        # Handles temporal axis.
+        x = F.pad(
+            x, pad=(max(0, n - 2), n - 1, n - 2, n - 1, n - 2, n - 1), mode=mode
+        ).to(dtype)
+        xl = F.conv3d(x, hl.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1))
+        xh = F.conv3d(x, hh.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1))
+
+        # Handles spatial axes.
+        xll = F.conv3d(xl, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xlh = F.conv3d(xl, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xhl = F.conv3d(xh, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xhh = F.conv3d(xh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+
+        xlll = F.conv3d(xll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xllh = F.conv3d(xll, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xlhl = F.conv3d(xlh, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xlhh = F.conv3d(xlh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhll = F.conv3d(xhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhlh = F.conv3d(xhl, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhhl = F.conv3d(xhh, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhhh = F.conv3d(xhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+
+        out = torch.cat([xlll, xllh, xlhl, xlhh, xhll, xhlh, xhhl, xhhh], dim=1)
+        if rescale:
+            out = out / (2 * torch.sqrt(torch.tensor(2.0)))
+        return out
+
+    def _haar(self, x):
+        xi, xv = torch.split(x, [1, x.shape[2] - 1], dim=2)
+        x = torch.cat([xi.repeat_interleave(self.patch_size, dim=2), xv], dim=2)
+        for _ in self.range:
+            x = self._dwt(x, "haar", rescale=True)
+        return x
+
+    def _arrange(self, x):
+        xi, xv = torch.split(x, [1, x.shape[2] - 1], dim=2)
+        x = torch.cat([xi.repeat_interleave(self.patch_size, dim=2), xv], dim=2)
+        x = rearrange(
+            x,
+            "b c (t p1) (h p2) (w p3) -> b (c p1 p2 p3) t h w",
+            p1=self.patch_size,
+            p2=self.patch_size,
+            p3=self.patch_size,
+        ).contiguous()
+        return x
+
+
+class UnPatcher(torch.nn.Module):
+    """A module to convert patches into image tensorsusing torch operations.
+
+    The main difference from `class Unpatching` is that this module implements
+    all operations using torch, rather than python or numpy, for efficiency purpose.
+
+    It's bit-wise identical to the Unpatching module outputs, with the added
+    benefit of being torch.jit scriptable.
+    """
+
+    def __init__(self, patch_size=1, patch_method="haar"):
+        super().__init__()
+        self.patch_size = patch_size
+        self.patch_method = patch_method
+        self.register_buffer(
+            "wavelets", _WAVELETS[patch_method], persistent=_PERSISTENT
+        )
+        self.range = range(int(torch.log2(torch.tensor(self.patch_size)).item()))
+        self.register_buffer(
+            "_arange",
+            torch.arange(_WAVELETS[patch_method].shape[0]),
+            persistent=_PERSISTENT,
+        )
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def forward(self, x):
+        if self.patch_method == "haar":
+            return self._ihaar(x)
+        elif self.patch_method == "rearrange":
+            return self._iarrange(x)
+        else:
+            raise ValueError("Unknown patch method: " + self.patch_method)
+
+    def _idwt(self, x, wavelet="haar", mode="reflect", rescale=False):
+        dtype = x.dtype
+        h = self.wavelets.to(device=x.device)
+        n = h.shape[0]
+
+        g = x.shape[1] // 4
+        hl = h.flip([0]).reshape(1, 1, -1).repeat([g, 1, 1])
+        hh = (h * ((-1) ** self._arange.to(device=x.device))).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = hh.to(dtype=dtype)
+        hl = hl.to(dtype=dtype)
+
+        xll, xlh, xhl, xhh = torch.chunk(x.to(dtype), 4, dim=1)
+
+        # Inverse transform.
+        yl = torch.nn.functional.conv_transpose2d(
+            xll, hl.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0)
+        )
+        yl += torch.nn.functional.conv_transpose2d(
+            xlh, hh.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0)
+        )
+        yh = torch.nn.functional.conv_transpose2d(
+            xhl, hl.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0)
+        )
+        yh += torch.nn.functional.conv_transpose2d(
+            xhh, hh.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0)
+        )
+        y = torch.nn.functional.conv_transpose2d(
+            yl, hl.unsqueeze(2), groups=g, stride=(1, 2), padding=(0, n - 2)
+        )
+        y += torch.nn.functional.conv_transpose2d(
+            yh, hh.unsqueeze(2), groups=g, stride=(1, 2), padding=(0, n - 2)
+        )
+
+        if rescale:
+            y = y * 2
+        return y
+
+    def _ihaar(self, x):
+        for _ in self.range:
+            x = self._idwt(x, "haar", rescale=True)
+        return x
+
+    def _iarrange(self, x):
+        x = rearrange(
+            x,
+            "b (c p1 p2) h w -> b c (h p1) (w p2)",
+            p1=self.patch_size,
+            p2=self.patch_size,
+        )
+        return x
+
+
+class UnPatcher3D(UnPatcher):
+    """A 3D inverse discrete wavelet transform for video wavelet decompositions."""
+
+    def __init__(self, patch_size=1, patch_method="haar"):
+        super().__init__(patch_method=patch_method, patch_size=patch_size)
+
+    def _idwt(self, x, wavelet="haar", mode="reflect", rescale=False):
+        dtype = x.dtype
+        h = self.wavelets.to(device=x.device)
+
+        g = x.shape[1] // 8  # split into 8 spatio-temporal filtered tesnors.
+        hl = h.flip([0]).reshape(1, 1, -1).repeat([g, 1, 1])
+        hh = (h * ((-1) ** self._arange.to(device=x.device))).reshape(1, 1, -1).repeat(g, 1, 1)
+        hl = hl.to(dtype=dtype)
+        hh = hh.to(dtype=dtype)
+
+        xlll, xllh, xlhl, xlhh, xhll, xhlh, xhhl, xhhh = torch.chunk(x, 8, dim=1)
+        del x
+
+        # Height height transposed convolutions.
+        xll = F.conv_transpose3d(
+            xlll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
+        )
+        del xlll
+
+        xll += F.conv_transpose3d(
+            xllh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
+        )
+        del xllh
+
+        xlh = F.conv_transpose3d(
+            xlhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
+        )
+        del xlhl
+
+        xlh += F.conv_transpose3d(
+            xlhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
+        )
+        del xlhh
+
+        xhl = F.conv_transpose3d(
+            xhll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
+        )
+        del xhll
+
+        xhl += F.conv_transpose3d(
+            xhlh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
+        )
+        del xhlh
+
+        xhh = F.conv_transpose3d(
+            xhhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
+        )
+        del xhhl
+
+        xhh += F.conv_transpose3d(
+            xhhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
+        )
+        del xhhh
+
+        # Handles width transposed convolutions.
+        xl = F.conv_transpose3d(
+            xll, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
+        )
+        del xll
+
+        xl += F.conv_transpose3d(
+            xlh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
+        )
+        del xlh
+
+        xh = F.conv_transpose3d(
+            xhl, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
+        )
+        del xhl
+
+        xh += F.conv_transpose3d(
+            xhh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
+        )
+        del xhh
+
+        # Handles time axis transposed convolutions.
+        x = F.conv_transpose3d(
+            xl, hl.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1)
+        )
+        del xl
+
+        x += F.conv_transpose3d(
+            xh, hh.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1)
+        )
+
+        if rescale:
+            x = x * (2 * torch.sqrt(torch.tensor(2.0)))
+        return x
+
+    def _ihaar(self, x):
+        for _ in self.range:
+            x = self._idwt(x, "haar", rescale=True)
+        x = x[:, :, self.patch_size - 1 :, ...]
+        return x
+
+    def _iarrange(self, x):
+        x = rearrange(
+            x,
+            "b (c p1 p2 p3) t h w -> b c (t p1) (h p2) (w p3)",
+            p1=self.patch_size,
+            p2=self.patch_size,
+            p3=self.patch_size,
+        )
+        x = x[:, :, self.patch_size - 1 :, ...]
+        return x
--- a/comfy/ldm/cosmos/cosmos_tokenizer/utils.py
+++ b/comfy/ldm/cosmos/cosmos_tokenizer/utils.py
@ -0,0 +1,112 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Shared utilities for the networks module."""
+
+from typing import Any
+
+import torch
+from einops import rearrange
+
+
+import comfy.ops
+ops = comfy.ops.disable_weight_init
+
+def time2batch(x: torch.Tensor) -> tuple[torch.Tensor, int]:
+    batch_size = x.shape[0]
+    return rearrange(x, "b c t h w -> (b t) c h w"), batch_size
+
+
+def batch2time(x: torch.Tensor, batch_size: int) -> torch.Tensor:
+    return rearrange(x, "(b t) c h w -> b c t h w", b=batch_size)
+
+
+def space2batch(x: torch.Tensor) -> tuple[torch.Tensor, int]:
+    batch_size, height = x.shape[0], x.shape[-2]
+    return rearrange(x, "b c t h w -> (b h w) c t"), batch_size, height
+
+
+def batch2space(x: torch.Tensor, batch_size: int, height: int) -> torch.Tensor:
+    return rearrange(x, "(b h w) c t -> b c t h w", b=batch_size, h=height)
+
+
+def cast_tuple(t: Any, length: int = 1) -> Any:
+    return t if isinstance(t, tuple) else ((t,) * length)
+
+
+def replication_pad(x):
+    return torch.cat([x[:, :, :1, ...], x], dim=2)
+
+
+def divisible_by(num: int, den: int) -> bool:
+    return (num % den) == 0
+
+
+def is_odd(n: int) -> bool:
+    return not divisible_by(n, 2)
+
+
+def nonlinearity(x):
+    return x * torch.sigmoid(x)
+
+
+def Normalize(in_channels, num_groups=32):
+    return ops.GroupNorm(
+        num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
+    )
+
+
+class CausalNormalize(torch.nn.Module):
+    def __init__(self, in_channels, num_groups=1):
+        super().__init__()
+        self.norm = ops.GroupNorm(
+            num_groups=num_groups,
+            num_channels=in_channels,
+            eps=1e-6,
+            affine=True,
+        )
+        self.num_groups = num_groups
+
+    def forward(self, x):
+        # if num_groups !=1, we apply a spatio-temporal groupnorm for backward compatibility purpose.
+        # All new models should use num_groups=1, otherwise causality is not guaranteed.
+        if self.num_groups == 1:
+            x, batch_size = time2batch(x)
+            return batch2time(self.norm(x), batch_size)
+        return self.norm(x)
+
+
+def exists(v):
+    return v is not None
+
+
+def default(*args):
+    for arg in args:
+        if exists(arg):
+            return arg
+    return None
+
+
+def round_ste(z: torch.Tensor) -> torch.Tensor:
+    """Round with straight through gradients."""
+    zhat = z.round()
+    return z + (zhat - z).detach()
+
+
+def log(t, eps=1e-5):
+    return t.clamp(min=eps).log()
+
+
+def entropy(prob):
+    return (-prob * log(prob)).sum(dim=-1)
--- a/comfy/ldm/cosmos/model.py
+++ b/comfy/ldm/cosmos/model.py
@ -0,0 +1,514 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+A general implementation of adaln-modulated VIT-like~(DiT) transformer for video processing.
+"""
+
+from typing import Optional, Tuple
+
+import torch
+from einops import rearrange
+from torch import nn
+from torchvision import transforms
+
+from enum import Enum
+import logging
+
+from comfy.ldm.modules.diffusionmodules.mmdit import RMSNorm
+
+from .blocks import (
+    FinalLayer,
+    GeneralDITTransformerBlock,
+    PatchEmbed,
+    TimestepEmbedding,
+    Timesteps,
+)
+
+from .position_embedding import LearnablePosEmbAxis, VideoRopePosition3DEmb
+
+
+class DataType(Enum):
+    IMAGE = "image"
+    VIDEO = "video"
+
+
+class GeneralDIT(nn.Module):
+    """
+    A general implementation of adaln-modulated VIT-like~(DiT) transformer for video processing.
+
+    Args:
+        max_img_h (int): Maximum height of the input images.
+        max_img_w (int): Maximum width of the input images.
+        max_frames (int): Maximum number of frames in the video sequence.
+        in_channels (int): Number of input channels (e.g., RGB channels for color images).
+        out_channels (int): Number of output channels.
+        patch_spatial (tuple): Spatial resolution of patches for input processing.
+        patch_temporal (int): Temporal resolution of patches for input processing.
+        concat_padding_mask (bool): If True, includes a mask channel in the input to handle padding.
+        block_config (str): Configuration of the transformer block. See Notes for supported block types.
+        model_channels (int): Base number of channels used throughout the model.
+        num_blocks (int): Number of transformer blocks.
+        num_heads (int): Number of heads in the multi-head attention layers.
+        mlp_ratio (float): Expansion ratio for MLP blocks.
+        block_x_format (str): Format of input tensor for transformer blocks ('BTHWD' or 'THWBD').
+        crossattn_emb_channels (int): Number of embedding channels for cross-attention.
+        use_cross_attn_mask (bool): Whether to use mask in cross-attention.
+        pos_emb_cls (str): Type of positional embeddings.
+        pos_emb_learnable (bool): Whether positional embeddings are learnable.
+        pos_emb_interpolation (str): Method for interpolating positional embeddings.
+        affline_emb_norm (bool): Whether to normalize affine embeddings.
+        use_adaln_lora (bool): Whether to use AdaLN-LoRA.
+        adaln_lora_dim (int): Dimension for AdaLN-LoRA.
+        rope_h_extrapolation_ratio (float): Height extrapolation ratio for RoPE.
+        rope_w_extrapolation_ratio (float): Width extrapolation ratio for RoPE.
+        rope_t_extrapolation_ratio (float): Temporal extrapolation ratio for RoPE.
+        extra_per_block_abs_pos_emb (bool): Whether to use extra per-block absolute positional embeddings.
+        extra_per_block_abs_pos_emb_type (str): Type of extra per-block positional embeddings.
+        extra_h_extrapolation_ratio (float): Height extrapolation ratio for extra embeddings.
+        extra_w_extrapolation_ratio (float): Width extrapolation ratio for extra embeddings.
+        extra_t_extrapolation_ratio (float): Temporal extrapolation ratio for extra embeddings.
+
+    Notes:
+        Supported block types in block_config:
+        * cross_attn, ca: Cross attention
+        * full_attn: Full attention on all flattened tokens
+        * mlp, ff: Feed forward block
+    """
+
+    def __init__(
+        self,
+        max_img_h: int,
+        max_img_w: int,
+        max_frames: int,
+        in_channels: int,
+        out_channels: int,
+        patch_spatial: tuple,
+        patch_temporal: int,
+        concat_padding_mask: bool = True,
+        # attention settings
+        block_config: str = "FA-CA-MLP",
+        model_channels: int = 768,
+        num_blocks: int = 10,
+        num_heads: int = 16,
+        mlp_ratio: float = 4.0,
+        block_x_format: str = "BTHWD",
+        # cross attention settings
+        crossattn_emb_channels: int = 1024,
+        use_cross_attn_mask: bool = False,
+        # positional embedding settings
+        pos_emb_cls: str = "sincos",
+        pos_emb_learnable: bool = False,
+        pos_emb_interpolation: str = "crop",
+        affline_emb_norm: bool = False,  # whether or not to normalize the affine embedding
+        use_adaln_lora: bool = False,
+        adaln_lora_dim: int = 256,
+        rope_h_extrapolation_ratio: float = 1.0,
+        rope_w_extrapolation_ratio: float = 1.0,
+        rope_t_extrapolation_ratio: float = 1.0,
+        extra_per_block_abs_pos_emb: bool = False,
+        extra_per_block_abs_pos_emb_type: str = "sincos",
+        extra_h_extrapolation_ratio: float = 1.0,
+        extra_w_extrapolation_ratio: float = 1.0,
+        extra_t_extrapolation_ratio: float = 1.0,
+        image_model=None,
+        device=None,
+        dtype=None,
+        operations=None,
+    ) -> None:
+        super().__init__()
+        self.max_img_h = max_img_h
+        self.max_img_w = max_img_w
+        self.max_frames = max_frames
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.patch_spatial = patch_spatial
+        self.patch_temporal = patch_temporal
+        self.num_heads = num_heads
+        self.num_blocks = num_blocks
+        self.model_channels = model_channels
+        self.use_cross_attn_mask = use_cross_attn_mask
+        self.concat_padding_mask = concat_padding_mask
+        # positional embedding settings
+        self.pos_emb_cls = pos_emb_cls
+        self.pos_emb_learnable = pos_emb_learnable
+        self.pos_emb_interpolation = pos_emb_interpolation
+        self.affline_emb_norm = affline_emb_norm
+        self.rope_h_extrapolation_ratio = rope_h_extrapolation_ratio
+        self.rope_w_extrapolation_ratio = rope_w_extrapolation_ratio
+        self.rope_t_extrapolation_ratio = rope_t_extrapolation_ratio
+        self.extra_per_block_abs_pos_emb = extra_per_block_abs_pos_emb
+        self.extra_per_block_abs_pos_emb_type = extra_per_block_abs_pos_emb_type.lower()
+        self.extra_h_extrapolation_ratio = extra_h_extrapolation_ratio
+        self.extra_w_extrapolation_ratio = extra_w_extrapolation_ratio
+        self.extra_t_extrapolation_ratio = extra_t_extrapolation_ratio
+        self.dtype = dtype
+        weight_args = {"device": device, "dtype": dtype}
+
+        in_channels = in_channels + 1 if concat_padding_mask else in_channels
+        self.x_embedder = PatchEmbed(
+            spatial_patch_size=patch_spatial,
+            temporal_patch_size=patch_temporal,
+            in_channels=in_channels,
+            out_channels=model_channels,
+            bias=False,
+            weight_args=weight_args,
+            operations=operations,
+        )
+
+        self.build_pos_embed(device=device, dtype=dtype)
+        self.block_x_format = block_x_format
+        self.use_adaln_lora = use_adaln_lora
+        self.adaln_lora_dim = adaln_lora_dim
+        self.t_embedder = nn.ModuleList(
+            [Timesteps(model_channels),
+             TimestepEmbedding(model_channels, model_channels, use_adaln_lora=use_adaln_lora, weight_args=weight_args, operations=operations),]
+        )
+
+        self.blocks = nn.ModuleDict()
+
+        for idx in range(num_blocks):
+            self.blocks[f"block{idx}"] = GeneralDITTransformerBlock(
+                x_dim=model_channels,
+                context_dim=crossattn_emb_channels,
+                num_heads=num_heads,
+                block_config=block_config,
+                mlp_ratio=mlp_ratio,
+                x_format=self.block_x_format,
+                use_adaln_lora=use_adaln_lora,
+                adaln_lora_dim=adaln_lora_dim,
+                weight_args=weight_args,
+                operations=operations,
+            )
+
+        if self.affline_emb_norm:
+            logging.debug("Building affine embedding normalization layer")
+            self.affline_norm = RMSNorm(model_channels, elementwise_affine=True, eps=1e-6)
+        else:
+            self.affline_norm = nn.Identity()
+
+        self.final_layer = FinalLayer(
+            hidden_size=self.model_channels,
+            spatial_patch_size=self.patch_spatial,
+            temporal_patch_size=self.patch_temporal,
+            out_channels=self.out_channels,
+            use_adaln_lora=self.use_adaln_lora,
+            adaln_lora_dim=self.adaln_lora_dim,
+            weight_args=weight_args,
+            operations=operations,
+        )
+
+    def build_pos_embed(self, device=None, dtype=None):
+        if self.pos_emb_cls == "rope3d":
+            cls_type = VideoRopePosition3DEmb
+        else:
+            raise ValueError(f"Unknown pos_emb_cls {self.pos_emb_cls}")
+
+        logging.debug(f"Building positional embedding with {self.pos_emb_cls} class, impl {cls_type}")
+        kwargs = dict(
+            model_channels=self.model_channels,
+            len_h=self.max_img_h // self.patch_spatial,
+            len_w=self.max_img_w // self.patch_spatial,
+            len_t=self.max_frames // self.patch_temporal,
+            is_learnable=self.pos_emb_learnable,
+            interpolation=self.pos_emb_interpolation,
+            head_dim=self.model_channels // self.num_heads,
+            h_extrapolation_ratio=self.rope_h_extrapolation_ratio,
+            w_extrapolation_ratio=self.rope_w_extrapolation_ratio,
+            t_extrapolation_ratio=self.rope_t_extrapolation_ratio,
+            device=device,
+        )
+        self.pos_embedder = cls_type(
+            **kwargs,
+        )
+
+        if self.extra_per_block_abs_pos_emb:
+            assert self.extra_per_block_abs_pos_emb_type in [
+                "learnable",
+            ], f"Unknown extra_per_block_abs_pos_emb_type {self.extra_per_block_abs_pos_emb_type}"
+            kwargs["h_extrapolation_ratio"] = self.extra_h_extrapolation_ratio
+            kwargs["w_extrapolation_ratio"] = self.extra_w_extrapolation_ratio
+            kwargs["t_extrapolation_ratio"] = self.extra_t_extrapolation_ratio
+            kwargs["device"] = device
+            kwargs["dtype"] = dtype
+            self.extra_pos_embedder = LearnablePosEmbAxis(
+                **kwargs,
+            )
+
+    def prepare_embedded_sequence(
+        self,
+        x_B_C_T_H_W: torch.Tensor,
+        fps: Optional[torch.Tensor] = None,
+        padding_mask: Optional[torch.Tensor] = None,
+        latent_condition: Optional[torch.Tensor] = None,
+        latent_condition_sigma: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Prepares an embedded sequence tensor by applying positional embeddings and handling padding masks.
+
+        Args:
+            x_B_C_T_H_W (torch.Tensor): video
+            fps (Optional[torch.Tensor]): Frames per second tensor to be used for positional embedding when required.
+                                    If None, a default value (`self.base_fps`) will be used.
+            padding_mask (Optional[torch.Tensor]): current it is not used
+
+        Returns:
+            Tuple[torch.Tensor, Optional[torch.Tensor]]:
+                - A tensor of shape (B, T, H, W, D) with the embedded sequence.
+                - An optional positional embedding tensor, returned only if the positional embedding class
+                (`self.pos_emb_cls`) includes 'rope'. Otherwise, None.
+
+        Notes:
+            - If `self.concat_padding_mask` is True, a padding mask channel is concatenated to the input tensor.
+            - The method of applying positional embeddings depends on the value of `self.pos_emb_cls`.
+            - If 'rope' is in `self.pos_emb_cls` (case insensitive), the positional embeddings are generated using
+                the `self.pos_embedder` with the shape [T, H, W].
+            - If "fps_aware" is in `self.pos_emb_cls`, the positional embeddings are generated using the
+            `self.pos_embedder` with the fps tensor.
+            - Otherwise, the positional embeddings are generated without considering fps.
+        """
+        if self.concat_padding_mask:
+            if padding_mask is not None:
+                padding_mask = transforms.functional.resize(
+                    padding_mask, list(x_B_C_T_H_W.shape[-2:]), interpolation=transforms.InterpolationMode.NEAREST
+                )
+            else:
+                padding_mask = torch.zeros((x_B_C_T_H_W.shape[0], 1, x_B_C_T_H_W.shape[-2], x_B_C_T_H_W.shape[-1]), dtype=x_B_C_T_H_W.dtype, device=x_B_C_T_H_W.device)
+
+            x_B_C_T_H_W = torch.cat(
+                [x_B_C_T_H_W, padding_mask.unsqueeze(1).repeat(1, 1, x_B_C_T_H_W.shape[2], 1, 1)], dim=1
+            )
+        x_B_T_H_W_D = self.x_embedder(x_B_C_T_H_W)
+
+        if self.extra_per_block_abs_pos_emb:
+            extra_pos_emb = self.extra_pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device, dtype=x_B_C_T_H_W.dtype)
+        else:
+            extra_pos_emb = None
+
+        if "rope" in self.pos_emb_cls.lower():
+            return x_B_T_H_W_D, self.pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device), extra_pos_emb
+
+        if "fps_aware" in self.pos_emb_cls:
+            x_B_T_H_W_D = x_B_T_H_W_D + self.pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device)  # [B, T, H, W, D]
+        else:
+            x_B_T_H_W_D = x_B_T_H_W_D + self.pos_embedder(x_B_T_H_W_D, device=x_B_C_T_H_W.device)  # [B, T, H, W, D]
+
+        return x_B_T_H_W_D, None, extra_pos_emb
+
+    def decoder_head(
+        self,
+        x_B_T_H_W_D: torch.Tensor,
+        emb_B_D: torch.Tensor,
+        crossattn_emb: torch.Tensor,
+        origin_shape: Tuple[int, int, int, int, int],  # [B, C, T, H, W]
+        crossattn_mask: Optional[torch.Tensor] = None,
+        adaln_lora_B_3D: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        del crossattn_emb, crossattn_mask
+        B, C, T_before_patchify, H_before_patchify, W_before_patchify = origin_shape
+        x_BT_HW_D = rearrange(x_B_T_H_W_D, "B T H W D -> (B T) (H W) D")
+        x_BT_HW_D = self.final_layer(x_BT_HW_D, emb_B_D, adaln_lora_B_3D=adaln_lora_B_3D)
+        # This is to ensure x_BT_HW_D has the correct shape because
+        # when we merge T, H, W into one dimension, x_BT_HW_D has shape (B * T * H * W, 1*1, D).
+        x_BT_HW_D = x_BT_HW_D.view(
+            B * T_before_patchify // self.patch_temporal,
+            H_before_patchify // self.patch_spatial * W_before_patchify // self.patch_spatial,
+            -1,
+        )
+        x_B_D_T_H_W = rearrange(
+            x_BT_HW_D,
+            "(B T) (H W) (p1 p2 t C) -> B C (T t) (H p1) (W p2)",
+            p1=self.patch_spatial,
+            p2=self.patch_spatial,
+            H=H_before_patchify // self.patch_spatial,
+            W=W_before_patchify // self.patch_spatial,
+            t=self.patch_temporal,
+            B=B,
+        )
+        return x_B_D_T_H_W
+
+    def forward_before_blocks(
+        self,
+        x: torch.Tensor,
+        timesteps: torch.Tensor,
+        crossattn_emb: torch.Tensor,
+        crossattn_mask: Optional[torch.Tensor] = None,
+        fps: Optional[torch.Tensor] = None,
+        image_size: Optional[torch.Tensor] = None,
+        padding_mask: Optional[torch.Tensor] = None,
+        scalar_feature: Optional[torch.Tensor] = None,
+        data_type: Optional[DataType] = DataType.VIDEO,
+        latent_condition: Optional[torch.Tensor] = None,
+        latent_condition_sigma: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Args:
+            x: (B, C, T, H, W) tensor of spatial-temp inputs
+            timesteps: (B, ) tensor of timesteps
+            crossattn_emb: (B, N, D) tensor of cross-attention embeddings
+            crossattn_mask: (B, N) tensor of cross-attention masks
+        """
+        del kwargs
+        assert isinstance(
+            data_type, DataType
+        ), f"Expected DataType, got {type(data_type)}. We need discuss this flag later."
+        original_shape = x.shape
+        x_B_T_H_W_D, rope_emb_L_1_1_D, extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D = self.prepare_embedded_sequence(
+            x,
+            fps=fps,
+            padding_mask=padding_mask,
+            latent_condition=latent_condition,
+            latent_condition_sigma=latent_condition_sigma,
+        )
+        # logging affline scale information
+        affline_scale_log_info = {}
+
+        timesteps_B_D, adaln_lora_B_3D = self.t_embedder[1](self.t_embedder[0](timesteps.flatten()).to(x.dtype))
+        affline_emb_B_D = timesteps_B_D
+        affline_scale_log_info["timesteps_B_D"] = timesteps_B_D.detach()
+
+        if scalar_feature is not None:
+            raise NotImplementedError("Scalar feature is not implemented yet.")
+
+        affline_scale_log_info["affline_emb_B_D"] = affline_emb_B_D.detach()
+        affline_emb_B_D = self.affline_norm(affline_emb_B_D)
+
+        if self.use_cross_attn_mask:
+            if crossattn_mask is not None and not torch.is_floating_point(crossattn_mask):
+                crossattn_mask = (crossattn_mask - 1).to(x.dtype) * torch.finfo(x.dtype).max
+            crossattn_mask = crossattn_mask[:, None, None, :]  # .to(dtype=torch.bool)  # [B, 1, 1, length]
+        else:
+            crossattn_mask = None
+
+        if self.blocks["block0"].x_format == "THWBD":
+            x = rearrange(x_B_T_H_W_D, "B T H W D -> T H W B D")
+            if extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D is not None:
+                extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D = rearrange(
+                    extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D, "B T H W D -> T H W B D"
+                )
+            crossattn_emb = rearrange(crossattn_emb, "B M D -> M B D")
+
+            if crossattn_mask:
+                crossattn_mask = rearrange(crossattn_mask, "B M -> M B")
+
+        elif self.blocks["block0"].x_format == "BTHWD":
+            x = x_B_T_H_W_D
+        else:
+            raise ValueError(f"Unknown x_format {self.blocks[0].x_format}")
+        output = {
+            "x": x,
+            "affline_emb_B_D": affline_emb_B_D,
+            "crossattn_emb": crossattn_emb,
+            "crossattn_mask": crossattn_mask,
+            "rope_emb_L_1_1_D": rope_emb_L_1_1_D,
+            "adaln_lora_B_3D": adaln_lora_B_3D,
+            "original_shape": original_shape,
+            "extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D": extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D,
+        }
+        return output
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        timesteps: torch.Tensor,
+        context: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        # crossattn_emb: torch.Tensor,
+        # crossattn_mask: Optional[torch.Tensor] = None,
+        fps: Optional[torch.Tensor] = None,
+        image_size: Optional[torch.Tensor] = None,
+        padding_mask: Optional[torch.Tensor] = None,
+        scalar_feature: Optional[torch.Tensor] = None,
+        data_type: Optional[DataType] = DataType.VIDEO,
+        latent_condition: Optional[torch.Tensor] = None,
+        latent_condition_sigma: Optional[torch.Tensor] = None,
+        condition_video_augment_sigma: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            x: (B, C, T, H, W) tensor of spatial-temp inputs
+            timesteps: (B, ) tensor of timesteps
+            crossattn_emb: (B, N, D) tensor of cross-attention embeddings
+            crossattn_mask: (B, N) tensor of cross-attention masks
+            condition_video_augment_sigma: (B,) used in lvg(long video generation), we add noise with this sigma to
+                augment condition input, the lvg model will condition on the condition_video_augment_sigma value;
+                we need forward_before_blocks pass to the forward_before_blocks function.
+        """
+
+        crossattn_emb = context
+        crossattn_mask = attention_mask
+
+        inputs = self.forward_before_blocks(
+            x=x,
+            timesteps=timesteps,
+            crossattn_emb=crossattn_emb,
+            crossattn_mask=crossattn_mask,
+            fps=fps,
+            image_size=image_size,
+            padding_mask=padding_mask,
+            scalar_feature=scalar_feature,
+            data_type=data_type,
+            latent_condition=latent_condition,
+            latent_condition_sigma=latent_condition_sigma,
+            condition_video_augment_sigma=condition_video_augment_sigma,
+            **kwargs,
+        )
+        x, affline_emb_B_D, crossattn_emb, crossattn_mask, rope_emb_L_1_1_D, adaln_lora_B_3D, original_shape = (
+            inputs["x"],
+            inputs["affline_emb_B_D"],
+            inputs["crossattn_emb"],
+            inputs["crossattn_mask"],
+            inputs["rope_emb_L_1_1_D"],
+            inputs["adaln_lora_B_3D"],
+            inputs["original_shape"],
+        )
+        extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D = inputs["extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D"].to(x.dtype)
+        del inputs
+
+        if extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D is not None:
+            assert (
+                x.shape == extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape
+            ), f"{x.shape} != {extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape} {original_shape}"
+
+        for _, block in self.blocks.items():
+            assert (
+                self.blocks["block0"].x_format == block.x_format
+            ), f"First block has x_format {self.blocks[0].x_format}, got {block.x_format}"
+
+            if extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D is not None:
+                x += extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D
+            x = block(
+                x,
+                affline_emb_B_D,
+                crossattn_emb,
+                crossattn_mask,
+                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
+                adaln_lora_B_3D=adaln_lora_B_3D,
+            )
+
+        x_B_T_H_W_D = rearrange(x, "T H W B D -> B T H W D")
+
+        x_B_D_T_H_W = self.decoder_head(
+            x_B_T_H_W_D=x_B_T_H_W_D,
+            emb_B_D=affline_emb_B_D,
+            crossattn_emb=None,
+            origin_shape=original_shape,
+            crossattn_mask=None,
+            adaln_lora_B_3D=adaln_lora_B_3D,
+        )
+
+        return x_B_D_T_H_W
--- a/comfy/ldm/cosmos/position_embedding.py
+++ b/comfy/ldm/cosmos/position_embedding.py
@ -0,0 +1,208 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional
+
+import torch
+from einops import rearrange, repeat
+from torch import nn
+import math
+
+
+def normalize(x: torch.Tensor, dim: Optional[List[int]] = None, eps: float = 0) -> torch.Tensor:
+    """
+    Normalizes the input tensor along specified dimensions such that the average square norm of elements is adjusted.
+
+    Args:
+        x (torch.Tensor): The input tensor to normalize.
+        dim (list, optional): The dimensions over which to normalize. If None, normalizes over all dimensions except the first.
+        eps (float, optional): A small constant to ensure numerical stability during division.
+
+    Returns:
+        torch.Tensor: The normalized tensor.
+    """
+    if dim is None:
+        dim = list(range(1, x.ndim))
+    norm = torch.linalg.vector_norm(x, dim=dim, keepdim=True, dtype=torch.float32)
+    norm = torch.add(eps, norm, alpha=math.sqrt(norm.numel() / x.numel()))
+    return x / norm.to(x.dtype)
+
+
+class VideoPositionEmb(nn.Module):
+    def forward(self, x_B_T_H_W_C: torch.Tensor, fps=Optional[torch.Tensor], device=None, dtype=None) -> torch.Tensor:
+        """
+        It delegates the embedding generation to generate_embeddings function.
+        """
+        B_T_H_W_C = x_B_T_H_W_C.shape
+        embeddings = self.generate_embeddings(B_T_H_W_C, fps=fps, device=device, dtype=dtype)
+
+        return embeddings
+
+    def generate_embeddings(self, B_T_H_W_C: torch.Size, fps=Optional[torch.Tensor], device=None):
+        raise NotImplementedError
+
+
+class VideoRopePosition3DEmb(VideoPositionEmb):
+    def __init__(
+        self,
+        *,  # enforce keyword arguments
+        head_dim: int,
+        len_h: int,
+        len_w: int,
+        len_t: int,
+        base_fps: int = 24,
+        h_extrapolation_ratio: float = 1.0,
+        w_extrapolation_ratio: float = 1.0,
+        t_extrapolation_ratio: float = 1.0,
+        device=None,
+        **kwargs,  # used for compatibility with other positional embeddings; unused in this class
+    ):
+        del kwargs
+        super().__init__()
+        self.register_buffer("seq", torch.arange(max(len_h, len_w, len_t), dtype=torch.float, device=device))
+        self.base_fps = base_fps
+        self.max_h = len_h
+        self.max_w = len_w
+
+        dim = head_dim
+        dim_h = dim // 6 * 2
+        dim_w = dim_h
+        dim_t = dim - 2 * dim_h
+        assert dim == dim_h + dim_w + dim_t, f"bad dim: {dim} != {dim_h} + {dim_w} + {dim_t}"
+        self.register_buffer(
+            "dim_spatial_range",
+            torch.arange(0, dim_h, 2, device=device)[: (dim_h // 2)].float() / dim_h,
+            persistent=False,
+        )
+        self.register_buffer(
+            "dim_temporal_range",
+            torch.arange(0, dim_t, 2, device=device)[: (dim_t // 2)].float() / dim_t,
+            persistent=False,
+        )
+
+        self.h_ntk_factor = h_extrapolation_ratio ** (dim_h / (dim_h - 2))
+        self.w_ntk_factor = w_extrapolation_ratio ** (dim_w / (dim_w - 2))
+        self.t_ntk_factor = t_extrapolation_ratio ** (dim_t / (dim_t - 2))
+
+    def generate_embeddings(
+        self,
+        B_T_H_W_C: torch.Size,
+        fps: Optional[torch.Tensor] = None,
+        h_ntk_factor: Optional[float] = None,
+        w_ntk_factor: Optional[float] = None,
+        t_ntk_factor: Optional[float] = None,
+        device=None,
+        dtype=None,
+    ):
+        """
+        Generate embeddings for the given input size.
+
+        Args:
+            B_T_H_W_C (torch.Size): Input tensor size (Batch, Time, Height, Width, Channels).
+            fps (Optional[torch.Tensor], optional): Frames per second. Defaults to None.
+            h_ntk_factor (Optional[float], optional): Height NTK factor. If None, uses self.h_ntk_factor.
+            w_ntk_factor (Optional[float], optional): Width NTK factor. If None, uses self.w_ntk_factor.
+            t_ntk_factor (Optional[float], optional): Time NTK factor. If None, uses self.t_ntk_factor.
+
+        Returns:
+            Not specified in the original code snippet.
+        """
+        h_ntk_factor = h_ntk_factor if h_ntk_factor is not None else self.h_ntk_factor
+        w_ntk_factor = w_ntk_factor if w_ntk_factor is not None else self.w_ntk_factor
+        t_ntk_factor = t_ntk_factor if t_ntk_factor is not None else self.t_ntk_factor
+
+        h_theta = 10000.0 * h_ntk_factor
+        w_theta = 10000.0 * w_ntk_factor
+        t_theta = 10000.0 * t_ntk_factor
+
+        h_spatial_freqs = 1.0 / (h_theta**self.dim_spatial_range.to(device=device))
+        w_spatial_freqs = 1.0 / (w_theta**self.dim_spatial_range.to(device=device))
+        temporal_freqs = 1.0 / (t_theta**self.dim_temporal_range.to(device=device))
+
+        B, T, H, W, _ = B_T_H_W_C
+        uniform_fps = (fps is None) or isinstance(fps, (int, float)) or (fps.min() == fps.max())
+        assert (
+            uniform_fps or B == 1 or T == 1
+        ), "For video batch, batch size should be 1 for non-uniform fps. For image batch, T should be 1"
+        assert (
+            H <= self.max_h and W <= self.max_w
+        ), f"Input dimensions (H={H}, W={W}) exceed the maximum dimensions (max_h={self.max_h}, max_w={self.max_w})"
+        half_emb_h = torch.outer(self.seq[:H].to(device=device), h_spatial_freqs)
+        half_emb_w = torch.outer(self.seq[:W].to(device=device), w_spatial_freqs)
+
+        # apply sequence scaling in temporal dimension
+        if fps is None:  # image case
+            half_emb_t = torch.outer(self.seq[:T].to(device=device), temporal_freqs)
+        else:
+            half_emb_t = torch.outer(self.seq[:T].to(device=device) / fps * self.base_fps, temporal_freqs)
+
+        half_emb_h = torch.stack([torch.cos(half_emb_h), -torch.sin(half_emb_h), torch.sin(half_emb_h), torch.cos(half_emb_h)], dim=-1)
+        half_emb_w = torch.stack([torch.cos(half_emb_w), -torch.sin(half_emb_w), torch.sin(half_emb_w), torch.cos(half_emb_w)], dim=-1)
+        half_emb_t = torch.stack([torch.cos(half_emb_t), -torch.sin(half_emb_t), torch.sin(half_emb_t), torch.cos(half_emb_t)], dim=-1)
+
+        em_T_H_W_D = torch.cat(
+            [
+                repeat(half_emb_t, "t d x -> t h w d x", h=H, w=W),
+                repeat(half_emb_h, "h d x -> t h w d x", t=T, w=W),
+                repeat(half_emb_w, "w d x -> t h w d x", t=T, h=H),
+            ]
+            , dim=-2,
+        )
+
+        return rearrange(em_T_H_W_D, "t h w d (i j) -> (t h w) d i j", i=2, j=2).float()
+
+
+class LearnablePosEmbAxis(VideoPositionEmb):
+    def __init__(
+        self,
+        *,  # enforce keyword arguments
+        interpolation: str,
+        model_channels: int,
+        len_h: int,
+        len_w: int,
+        len_t: int,
+        device=None,
+        dtype=None,
+        **kwargs,
+    ):
+        """
+        Args:
+            interpolation (str): we curretly only support "crop", ideally when we need extrapolation capacity, we should adjust frequency or other more advanced methods. they are not implemented yet.
+        """
+        del kwargs  # unused
+        super().__init__()
+        self.interpolation = interpolation
+        assert self.interpolation in ["crop"], f"Unknown interpolation method {self.interpolation}"
+
+        self.pos_emb_h = nn.Parameter(torch.empty(len_h, model_channels, device=device, dtype=dtype))
+        self.pos_emb_w = nn.Parameter(torch.empty(len_w, model_channels, device=device, dtype=dtype))
+        self.pos_emb_t = nn.Parameter(torch.empty(len_t, model_channels, device=device, dtype=dtype))
+
+    def generate_embeddings(self, B_T_H_W_C: torch.Size, fps=Optional[torch.Tensor], device=None, dtype=None) -> torch.Tensor:
+        B, T, H, W, _ = B_T_H_W_C
+        if self.interpolation == "crop":
+            emb_h_H = self.pos_emb_h[:H].to(device=device, dtype=dtype)
+            emb_w_W = self.pos_emb_w[:W].to(device=device, dtype=dtype)
+            emb_t_T = self.pos_emb_t[:T].to(device=device, dtype=dtype)
+            emb = (
+                repeat(emb_t_T, "t d-> b t h w d", b=B, h=H, w=W)
+                + repeat(emb_h_H, "h d-> b t h w d", b=B, t=T, w=W)
+                + repeat(emb_w_W, "w d-> b t h w d", b=B, t=T, h=H)
+            )
+            assert list(emb.shape)[:4] == [B, T, H, W], f"bad shape: {list(emb.shape)[:4]} != {B, T, H, W}"
+        else:
+            raise ValueError(f"Unknown interpolation method {self.interpolation}")
+
+        return normalize(emb, dim=-1, eps=1e-6)
--- a/comfy/ldm/cosmos/vae.py
+++ b/comfy/ldm/cosmos/vae.py
@ -0,0 +1,131 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The causal continuous video tokenizer with VAE or AE formulation for 3D data.."""
+
+import logging
+import torch
+from torch import nn
+from enum import Enum
+import math
+
+from .cosmos_tokenizer.layers3d import (
+    EncoderFactorized,
+    DecoderFactorized,
+    CausalConv3d,
+)
+
+
+class IdentityDistribution(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, parameters):
+        return parameters, (torch.tensor([0.0]), torch.tensor([0.0]))
+
+
+class GaussianDistribution(torch.nn.Module):
+    def __init__(self, min_logvar: float = -30.0, max_logvar: float = 20.0):
+        super().__init__()
+        self.min_logvar = min_logvar
+        self.max_logvar = max_logvar
+
+    def sample(self, mean, logvar):
+        std = torch.exp(0.5 * logvar)
+        return mean + std * torch.randn_like(mean)
+
+    def forward(self, parameters):
+        mean, logvar = torch.chunk(parameters, 2, dim=1)
+        logvar = torch.clamp(logvar, self.min_logvar, self.max_logvar)
+        return self.sample(mean, logvar), (mean, logvar)
+
+
+class ContinuousFormulation(Enum):
+    VAE = GaussianDistribution
+    AE = IdentityDistribution
+
+
+class CausalContinuousVideoTokenizer(nn.Module):
+    def __init__(
+        self, z_channels: int, z_factor: int, latent_channels: int, **kwargs
+    ) -> None:
+        super().__init__()
+        self.name = kwargs.get("name", "CausalContinuousVideoTokenizer")
+        self.latent_channels = latent_channels
+        self.sigma_data = 0.5
+
+        # encoder_name = kwargs.get("encoder", Encoder3DType.BASE.name)
+        self.encoder = EncoderFactorized(
+            z_channels=z_factor * z_channels, **kwargs
+        )
+        if kwargs.get("temporal_compression", 4) == 4:
+            kwargs["channels_mult"] = [2, 4]
+        # decoder_name = kwargs.get("decoder", Decoder3DType.BASE.name)
+        self.decoder = DecoderFactorized(
+            z_channels=z_channels, **kwargs
+        )
+
+        self.quant_conv = CausalConv3d(
+            z_factor * z_channels,
+            z_factor * latent_channels,
+            kernel_size=1,
+            padding=0,
+        )
+        self.post_quant_conv = CausalConv3d(
+            latent_channels, z_channels, kernel_size=1, padding=0
+        )
+
+        # formulation_name = kwargs.get("formulation", ContinuousFormulation.AE.name)
+        self.distribution = IdentityDistribution()  # ContinuousFormulation[formulation_name].value()
+
+        num_parameters = sum(param.numel() for param in self.parameters())
+        logging.debug(f"model={self.name}, num_parameters={num_parameters:,}")
+        logging.debug(
+            f"z_channels={z_channels}, latent_channels={self.latent_channels}."
+        )
+
+        latent_temporal_chunk = 16
+        self.latent_mean = nn.Parameter(torch.zeros([self.latent_channels * latent_temporal_chunk], dtype=torch.float32))
+        self.latent_std = nn.Parameter(torch.ones([self.latent_channels * latent_temporal_chunk], dtype=torch.float32))
+
+
+    def encode(self, x):
+        h = self.encoder(x)
+        moments = self.quant_conv(h)
+        z, posteriors = self.distribution(moments)
+        latent_ch = z.shape[1]
+        latent_t = z.shape[2]
+        in_dtype = z.dtype
+        mean = self.latent_mean.view(latent_ch, -1)
+        std = self.latent_std.view(latent_ch, -1)
+
+        mean = mean.repeat(1, math.ceil(latent_t / mean.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
+        std = std.repeat(1, math.ceil(latent_t / std.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
+        return ((z - mean) / std) * self.sigma_data
+
+    def decode(self, z):
+        in_dtype = z.dtype
+        latent_ch = z.shape[1]
+        latent_t = z.shape[2]
+        mean = self.latent_mean.view(latent_ch, -1)
+        std = self.latent_std.view(latent_ch, -1)
+
+        mean = mean.repeat(1, math.ceil(latent_t / mean.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
+        std = std.repeat(1, math.ceil(latent_t / std.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
+
+        z = z / self.sigma_data
+        z = z * std + mean
+        z = self.post_quant_conv(z)
+        return self.decoder(z)
+
--- a/comfy/ldm/flux/controlnet.py
+++ b/comfy/ldm/flux/controlnet.py
@ -6,9 +6,7 @@ import math
 from torch import Tensor, nn
 from einops import rearrange, repeat

-from .layers import (DoubleStreamBlock, EmbedND, LastLayer,
-                                 MLPEmbedder, SingleStreamBlock,
-                                 timestep_embedding)
+from .layers import (timestep_embedding)

 from .model import Flux
 import comfy.ldm.common_dit
--- a/comfy/ldm/flux/layers.py
+++ b/comfy/ldm/flux/layers.py
@ -105,7 +105,9 @@ class Modulation(nn.Module):
        self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)

    def forward(self, vec: Tensor) -> tuple:
-        out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
+        if vec.ndim == 2:
+            vec = vec[:, None, :]
+        out = self.lin(nn.functional.silu(vec)).chunk(self.multiplier, dim=-1)

        return (
            ModulationOut(*out[:3]),
@ -113,8 +115,22 @@ class Modulation(nn.Module):
        )


+def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None):
+    if modulation_dims is None:
+        if m_add is not None:
+            return tensor * m_mult + m_add
+        else:
+            return tensor * m_mult
+    else:
+        for d in modulation_dims:
+            tensor[:, d[0]:d[1]] *= m_mult[:, d[2]]
+            if m_add is not None:
+                tensor[:, d[0]:d[1]] += m_add[:, d[2]]
+        return tensor
+
+
 class DoubleStreamBlock(nn.Module):
-    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, dtype=None, device=None, operations=None):
+    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
        super().__init__()

        mlp_hidden_dim = int(hidden_size * mlp_ratio)
@ -141,39 +157,50 @@ class DoubleStreamBlock(nn.Module):
            nn.GELU(approximate="tanh"),
            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
        )
+        self.flipped_img_txt = flipped_img_txt

-    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor):
+    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None):
        img_mod1, img_mod2 = self.img_mod(vec)
        txt_mod1, txt_mod2 = self.txt_mod(vec)

        # prepare image for attention
        img_modulated = self.img_norm1(img)
-        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
+        img_modulated = apply_mod(img_modulated, (1 + img_mod1.scale), img_mod1.shift, modulation_dims_img)
        img_qkv = self.img_attn.qkv(img_modulated)
        img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

        # prepare txt for attention
        txt_modulated = self.txt_norm1(txt)
-        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
+        txt_modulated = apply_mod(txt_modulated, (1 + txt_mod1.scale), txt_mod1.shift, modulation_dims_txt)
        txt_qkv = self.txt_attn.qkv(txt_modulated)
        txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)

+        if self.flipped_img_txt:
+            # run actual attention
+            attn = attention(torch.cat((img_q, txt_q), dim=2),
+                             torch.cat((img_k, txt_k), dim=2),
+                             torch.cat((img_v, txt_v), dim=2),
+                             pe=pe, mask=attn_mask)
+
+            img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1]:]
+        else:
            # run actual attention
            attn = attention(torch.cat((txt_q, img_q), dim=2),
                             torch.cat((txt_k, img_k), dim=2),
-                         torch.cat((txt_v, img_v), dim=2), pe=pe)
+                             torch.cat((txt_v, img_v), dim=2),
+                             pe=pe, mask=attn_mask)

-        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
+            txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]

        # calculate the img bloks
-        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
-        img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
+        img = img + apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
+        img = img + apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img)

        # calculate the txt bloks
-        txt += txt_mod1.gate * self.txt_attn.proj(txt_attn)
-        txt += txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
+        txt += apply_mod(self.txt_attn.proj(txt_attn), txt_mod1.gate, None, modulation_dims_txt)
+        txt += apply_mod(self.txt_mlp(apply_mod(self.txt_norm2(txt), (1 + txt_mod2.scale), txt_mod2.shift, modulation_dims_txt)), txt_mod2.gate, None, modulation_dims_txt)

        if txt.dtype == torch.float16:
            txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
@ -217,19 +244,18 @@ class SingleStreamBlock(nn.Module):
        self.mlp_act = nn.GELU(approximate="tanh")
        self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)

-    def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
+    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None) -> Tensor:
        mod, _ = self.modulation(vec)
-        x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
-        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
+        qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)

        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k = self.norm(q, k, v)

        # compute attention
-        attn = attention(q, k, v, pe=pe)
+        attn = attention(q, k, v, pe=pe, mask=attn_mask)
        # compute activation in mlp stream, cat again and run second linear layer
        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
-        x += mod.gate * output
+        x += apply_mod(output, mod.gate, None, modulation_dims)
        if x.dtype == torch.float16:
            x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
        return x
@ -242,8 +268,11 @@ class LastLayer(nn.Module):
        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))

-    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
-        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
-        x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
+    def forward(self, x: Tensor, vec: Tensor, modulation_dims=None) -> Tensor:
+        if vec.ndim == 2:
+            vec = vec[:, None, :]
+
+        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=-1)
+        x = apply_mod(self.norm_final(x), (1 + scale), shift, modulation_dims)
        x = self.linear(x)
        return x
--- a/comfy/ldm/flux/math.py
+++ b/comfy/ldm/flux/math.py
@ -1,20 +1,29 @@
 import torch
 from einops import rearrange
 from torch import Tensor
+
 from comfy.ldm.modules.attention import optimized_attention
 import comfy.model_management

-def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
-    q, k = apply_rope(q, k, pe)
+
+def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor:
+    q_shape = q.shape
+    k_shape = k.shape
+
+    if pe is not None:
+        q = q.to(dtype=pe.dtype).reshape(*q.shape[:-1], -1, 1, 2)
+        k = k.to(dtype=pe.dtype).reshape(*k.shape[:-1], -1, 1, 2)
+        q = (pe[..., 0] * q[..., 0] + pe[..., 1] * q[..., 1]).reshape(*q_shape).type_as(v)
+        k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v)

    heads = q.shape[1]
-    x = optimized_attention(q, k, v, heads, skip_reshape=True)
+    x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask)
    return x


 def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    assert dim % 2 == 0
-    if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu():
+    if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu() or comfy.model_management.is_directml_enabled():
        device = torch.device("cpu")
    else:
        device = pos.device
@ -28,8 +37,9 @@ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:


 def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
-    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
-    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
+    xq_ = xq.to(dtype=freqs_cis.dtype).reshape(*xq.shape[:-1], -1, 1, 2)
+    xk_ = xk.to(dtype=freqs_cis.dtype).reshape(*xk.shape[:-1], -1, 1, 2)
    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
+
--- a/comfy/ldm/flux/model.py
+++ b/comfy/ldm/flux/model.py
@ -4,6 +4,8 @@ from dataclasses import dataclass

 import torch
 from torch import Tensor, nn
+from einops import rearrange, repeat
+import comfy.ldm.common_dit

 from .layers import (
    DoubleStreamBlock,
@ -14,9 +16,6 @@ from .layers import (
    timestep_embedding,
 )

-from einops import rearrange, repeat
-import comfy.ldm.common_dit
-
@dataclass
 class FluxParams:
    in_channels: int
@ -98,8 +97,9 @@ class Flux(nn.Module):
        timesteps: Tensor,
        y: Tensor,
        guidance: Tensor = None,
-        control=None,
+        control = None,
        transformer_options={},
+        attn_mask: Tensor = None,
    ) -> Tensor:
        patches_replace = transformer_options.get("patches_replace", {})
        if img.ndim != 3 or txt.ndim != 3:
@ -109,29 +109,44 @@ class Flux(nn.Module):
        img = self.img_in(img)
        vec = self.time_in(timestep_embedding(timesteps, 256).to(img.dtype))
        if self.params.guidance_embed:
-            if guidance is None:
-                raise ValueError("Didn't get guidance strength for guidance distilled model.")
+            if guidance is not None:
                vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))

        vec = vec + self.vector_in(y[:,:self.params.vec_in_dim])
        txt = self.txt_in(txt)

+        if img_ids is not None:
            ids = torch.cat((txt_ids, img_ids), dim=1)
            pe = self.pe_embedder(ids)
+        else:
+            pe = None

        blocks_replace = patches_replace.get("dit", {})
        for i, block in enumerate(self.double_blocks):
            if ("double_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
-                    out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"])
+                    out["img"], out["txt"] = block(img=args["img"],
+                                                   txt=args["txt"],
+                                                   vec=args["vec"],
+                                                   pe=args["pe"],
+                                                   attn_mask=args.get("attn_mask"))
                    return out

-                out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe}, {"original_block": block_wrap})
+                out = blocks_replace[("double_block", i)]({"img": img,
+                                                           "txt": txt,
+                                                           "vec": vec,
+                                                           "pe": pe,
+                                                           "attn_mask": attn_mask},
+                                                          {"original_block": block_wrap})
                txt = out["txt"]
                img = out["img"]
            else:
-                img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
+                img, txt = block(img=img,
+                                 txt=txt,
+                                 vec=vec,
+                                 pe=pe,
+                                 attn_mask=attn_mask)

            if control is not None: # Controlnet
                control_i = control.get("input")
@ -146,13 +161,20 @@ class Flux(nn.Module):
            if ("single_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
-                    out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"])
+                    out["img"] = block(args["img"],
+                                       vec=args["vec"],
+                                       pe=args["pe"],
+                                       attn_mask=args.get("attn_mask"))
                    return out

-                out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe}, {"original_block": block_wrap})
+                out = blocks_replace[("single_block", i)]({"img": img,
+                                                           "vec": vec,
+                                                           "pe": pe,
+                                                           "attn_mask": attn_mask},
+                                                          {"original_block": block_wrap})
                img = out["img"]
            else:
-                img = block(img, vec=vec, pe=pe)
+                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)

            if control is not None: # Controlnet
                control_o = control.get("output")
@ -166,7 +188,7 @@ class Flux(nn.Module):
        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
        return img

-    def forward(self, x, timestep, context, y, guidance, control=None, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, y, guidance=None, control=None, transformer_options={}, **kwargs):
        bs, c, h, w = x.shape
        patch_size = self.patch_size
        x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
@ -181,5 +203,5 @@ class Flux(nn.Module):
        img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)

        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
-        out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options)
+        out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h,:w]
--- a/comfy/ldm/genmo/joint_model/asymm_models_joint.py
+++ b/comfy/ldm/genmo/joint_model/asymm_models_joint.py
@ -461,8 +461,6 @@ class AsymmDiTJoint(nn.Module):
        pH, pW = H // self.patch_size, W // self.patch_size
        x = self.embed_x(x)  # (B, N, D), where N = T * H * W / patch_size ** 2
        assert x.ndim == 3
-        B = x.size(0)
-

        pH, pW = H // self.patch_size, W // self.patch_size
        N = T * pH * pW
--- a/comfy/ldm/genmo/joint_model/utils.py
+++ b/comfy/ldm/genmo/joint_model/utils.py
@ -1,7 +1,7 @@
 #original code from https://github.com/genmoai/models under apache 2.0 license
 #adapted to ComfyUI

-from typing import Optional, Tuple
+from typing import Optional

 import torch
 import torch.nn as nn
--- a/comfy/ldm/genmo/vae/model.py
+++ b/comfy/ldm/genmo/vae/model.py
@ -1,7 +1,7 @@
 #original code from https://github.com/genmoai/models under apache 2.0 license
 #adapted to ComfyUI

-from typing import Callable, List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union
 from functools import partial
 import math

--- a/comfy/ldm/hidream/model.py
+++ b/comfy/ldm/hidream/model.py
@ -0,0 +1,828 @@
+from typing import Optional, Tuple, List
+
+import torch
+import torch.nn as nn
+import einops
+from einops import repeat
+
+from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps
+import torch.nn.functional as F
+
+from comfy.ldm.flux.math import apply_rope
+from comfy.ldm.modules.attention import optimized_attention
+import comfy.model_management
+
+# Copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/math.py
+def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
+    assert dim % 2 == 0, "The dimension must be even."
+
+    scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
+    omega = 1.0 / (theta**scale)
+
+    batch_size, seq_length = pos.shape
+    out = torch.einsum("...n,d->...nd", pos, omega)
+    cos_out = torch.cos(out)
+    sin_out = torch.sin(out)
+
+    stacked_out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1)
+    out = stacked_out.view(batch_size, -1, dim // 2, 2, 2)
+    return out.float()
+
+
+# Copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py
+class EmbedND(nn.Module):
+    def __init__(self, theta: int, axes_dim: List[int]):
+        super().__init__()
+        self.theta = theta
+        self.axes_dim = axes_dim
+
+    def forward(self, ids: torch.Tensor) -> torch.Tensor:
+        n_axes = ids.shape[-1]
+        emb = torch.cat(
+            [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
+            dim=-3,
+        )
+        return emb.unsqueeze(2)
+
+
+class PatchEmbed(nn.Module):
+    def __init__(
+        self,
+        patch_size=2,
+        in_channels=4,
+        out_channels=1024,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        self.out_channels = out_channels
+        self.proj = operations.Linear(in_channels * patch_size * patch_size, out_channels, bias=True, dtype=dtype, device=device)
+
+    def forward(self, latent):
+        latent = self.proj(latent)
+        return latent
+
+
+class PooledEmbed(nn.Module):
+    def __init__(self, text_emb_dim, hidden_size, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.pooled_embedder = TimestepEmbedding(in_channels=text_emb_dim, time_embed_dim=hidden_size, dtype=dtype, device=device, operations=operations)
+
+    def forward(self, pooled_embed):
+        return self.pooled_embedder(pooled_embed)
+
+
+class TimestepEmbed(nn.Module):
+    def __init__(self, hidden_size, frequency_embedding_size=256, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.time_proj = Timesteps(num_channels=frequency_embedding_size, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=frequency_embedding_size, time_embed_dim=hidden_size, dtype=dtype, device=device, operations=operations)
+
+    def forward(self, timesteps, wdtype):
+        t_emb = self.time_proj(timesteps).to(dtype=wdtype)
+        t_emb = self.timestep_embedder(t_emb)
+        return t_emb
+
+
+class OutEmbed(nn.Module):
+    def __init__(self, hidden_size, patch_size, out_channels, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device)
+        )
+
+    def forward(self, x, adaln_input):
+        shift, scale = self.adaLN_modulation(adaln_input).chunk(2, dim=1)
+        x = self.norm_final(x) * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+        x = self.linear(x)
+        return x
+
+
+def attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor):
+    return optimized_attention(query.view(query.shape[0], -1, query.shape[-1] * query.shape[-2]), key.view(key.shape[0], -1, key.shape[-1] * key.shape[-2]), value.view(value.shape[0], -1, value.shape[-1] * value.shape[-2]), query.shape[2])
+
+
+class HiDreamAttnProcessor_flashattn:
+    """Attention processor used typically in processing the SD3-like self-attention projections."""
+
+    def __call__(
+        self,
+        attn,
+        image_tokens: torch.FloatTensor,
+        image_tokens_masks: Optional[torch.FloatTensor] = None,
+        text_tokens: Optional[torch.FloatTensor] = None,
+        rope: torch.FloatTensor = None,
+        *args,
+        **kwargs,
+    ) -> torch.FloatTensor:
+        dtype = image_tokens.dtype
+        batch_size = image_tokens.shape[0]
+
+        query_i = attn.q_rms_norm(attn.to_q(image_tokens)).to(dtype=dtype)
+        key_i = attn.k_rms_norm(attn.to_k(image_tokens)).to(dtype=dtype)
+        value_i = attn.to_v(image_tokens)
+
+        inner_dim = key_i.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query_i = query_i.view(batch_size, -1, attn.heads, head_dim)
+        key_i = key_i.view(batch_size, -1, attn.heads, head_dim)
+        value_i = value_i.view(batch_size, -1, attn.heads, head_dim)
+        if image_tokens_masks is not None:
+            key_i = key_i * image_tokens_masks.view(batch_size, -1, 1, 1)
+
+        if not attn.single:
+            query_t = attn.q_rms_norm_t(attn.to_q_t(text_tokens)).to(dtype=dtype)
+            key_t = attn.k_rms_norm_t(attn.to_k_t(text_tokens)).to(dtype=dtype)
+            value_t = attn.to_v_t(text_tokens)
+
+            query_t = query_t.view(batch_size, -1, attn.heads, head_dim)
+            key_t = key_t.view(batch_size, -1, attn.heads, head_dim)
+            value_t = value_t.view(batch_size, -1, attn.heads, head_dim)
+
+            num_image_tokens = query_i.shape[1]
+            num_text_tokens = query_t.shape[1]
+            query = torch.cat([query_i, query_t], dim=1)
+            key = torch.cat([key_i, key_t], dim=1)
+            value = torch.cat([value_i, value_t], dim=1)
+        else:
+            query = query_i
+            key = key_i
+            value = value_i
+
+        if query.shape[-1] == rope.shape[-3] * 2:
+            query, key = apply_rope(query, key, rope)
+        else:
+            query_1, query_2 = query.chunk(2, dim=-1)
+            key_1, key_2 = key.chunk(2, dim=-1)
+            query_1, key_1 = apply_rope(query_1, key_1, rope)
+            query = torch.cat([query_1, query_2], dim=-1)
+            key = torch.cat([key_1, key_2], dim=-1)
+
+        hidden_states = attention(query, key, value)
+
+        if not attn.single:
+            hidden_states_i, hidden_states_t = torch.split(hidden_states, [num_image_tokens, num_text_tokens], dim=1)
+            hidden_states_i = attn.to_out(hidden_states_i)
+            hidden_states_t = attn.to_out_t(hidden_states_t)
+            return hidden_states_i, hidden_states_t
+        else:
+            hidden_states = attn.to_out(hidden_states)
+            return hidden_states
+
+class HiDreamAttention(nn.Module):
+    def __init__(
+        self,
+        query_dim: int,
+        heads: int = 8,
+        dim_head: int = 64,
+        upcast_attention: bool = False,
+        upcast_softmax: bool = False,
+        scale_qk: bool = True,
+        eps: float = 1e-5,
+        processor = None,
+        out_dim: int = None,
+        single: bool = False,
+        dtype=None, device=None, operations=None
+    ):
+        # super(Attention, self).__init__()
+        super().__init__()
+        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
+        self.query_dim = query_dim
+        self.upcast_attention = upcast_attention
+        self.upcast_softmax = upcast_softmax
+        self.out_dim = out_dim if out_dim is not None else query_dim
+
+        self.scale_qk = scale_qk
+        self.scale = dim_head**-0.5 if self.scale_qk else 1.0
+
+        self.heads = out_dim // dim_head if out_dim is not None else heads
+        self.sliceable_head_dim = heads
+        self.single = single
+
+        linear_cls = operations.Linear
+        self.linear_cls = linear_cls
+        self.to_q = linear_cls(query_dim, self.inner_dim, dtype=dtype, device=device)
+        self.to_k = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
+        self.to_v = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
+        self.to_out = linear_cls(self.inner_dim, self.out_dim, dtype=dtype, device=device)
+        self.q_rms_norm = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
+        self.k_rms_norm = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
+
+        if not single:
+            self.to_q_t = linear_cls(query_dim, self.inner_dim, dtype=dtype, device=device)
+            self.to_k_t = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
+            self.to_v_t = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
+            self.to_out_t = linear_cls(self.inner_dim, self.out_dim, dtype=dtype, device=device)
+            self.q_rms_norm_t = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
+            self.k_rms_norm_t = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
+
+        self.processor = processor
+
+    def forward(
+        self,
+        norm_image_tokens: torch.FloatTensor,
+        image_tokens_masks: torch.FloatTensor = None,
+        norm_text_tokens: torch.FloatTensor = None,
+        rope: torch.FloatTensor = None,
+    ) -> torch.Tensor:
+        return self.processor(
+            self,
+            image_tokens = norm_image_tokens,
+            image_tokens_masks = image_tokens_masks,
+            text_tokens = norm_text_tokens,
+            rope = rope,
+        )
+
+
+class FeedForwardSwiGLU(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        multiple_of: int = 256,
+        ffn_dim_multiplier: Optional[float] = None,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        # custom dim factor multiplier
+        if ffn_dim_multiplier is not None:
+            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
+        hidden_dim = multiple_of * (
+            (hidden_dim + multiple_of - 1) // multiple_of
+        )
+
+        self.w1 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
+        self.w2 = operations.Linear(hidden_dim, dim, bias=False, dtype=dtype, device=device)
+        self.w3 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
+
+    def forward(self, x):
+        return self.w2(torch.nn.functional.silu(self.w1(x)) * self.w3(x))
+
+
+# Modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
+class MoEGate(nn.Module):
+    def __init__(self, embed_dim, num_routed_experts=4, num_activated_experts=2, aux_loss_alpha=0.01, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.top_k = num_activated_experts
+        self.n_routed_experts = num_routed_experts
+
+        self.scoring_func = 'softmax'
+        self.alpha = aux_loss_alpha
+        self.seq_aux = False
+
+        # topk selection algorithm
+        self.norm_topk_prob = False
+        self.gating_dim = embed_dim
+        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim), dtype=dtype, device=device))
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        pass
+        # import torch.nn.init  as init
+        # init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+
+    def forward(self, hidden_states):
+        bsz, seq_len, h = hidden_states.shape
+
+        ### compute gating score
+        hidden_states = hidden_states.view(-1, h)
+        logits = F.linear(hidden_states, comfy.model_management.cast_to(self.weight, dtype=hidden_states.dtype, device=hidden_states.device), None)
+        if self.scoring_func == 'softmax':
+            scores = logits.softmax(dim=-1)
+        else:
+            raise NotImplementedError(f'insupportable scoring function for MoE gating: {self.scoring_func}')
+
+        ### select top-k experts
+        topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
+
+        ### norm gate to sum 1
+        if self.top_k > 1 and self.norm_topk_prob:
+            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
+            topk_weight = topk_weight / denominator
+
+        aux_loss = None
+        return topk_idx, topk_weight, aux_loss
+
+
+# Modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
+class MOEFeedForwardSwiGLU(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        num_routed_experts: int,
+        num_activated_experts: int,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+        self.shared_experts = FeedForwardSwiGLU(dim, hidden_dim // 2, dtype=dtype, device=device, operations=operations)
+        self.experts = nn.ModuleList([FeedForwardSwiGLU(dim, hidden_dim, dtype=dtype, device=device, operations=operations) for i in range(num_routed_experts)])
+        self.gate = MoEGate(
+            embed_dim = dim,
+            num_routed_experts = num_routed_experts,
+            num_activated_experts = num_activated_experts,
+            dtype=dtype, device=device, operations=operations
+        )
+        self.num_activated_experts = num_activated_experts
+
+    def forward(self, x):
+        wtype = x.dtype
+        identity = x
+        orig_shape = x.shape
+        topk_idx, topk_weight, aux_loss = self.gate(x)
+        x = x.view(-1, x.shape[-1])
+        flat_topk_idx = topk_idx.view(-1)
+        if True:  # self.training: # TODO: check which branch performs faster
+            x = x.repeat_interleave(self.num_activated_experts, dim=0)
+            y = torch.empty_like(x, dtype=wtype)
+            for i, expert in enumerate(self.experts):
+                y[flat_topk_idx == i] = expert(x[flat_topk_idx == i]).to(dtype=wtype)
+            y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
+            y =  y.view(*orig_shape).to(dtype=wtype)
+            #y = AddAuxiliaryLoss.apply(y, aux_loss)
+        else:
+            y = self.moe_infer(x, flat_topk_idx, topk_weight.view(-1, 1)).view(*orig_shape)
+        y = y + self.shared_experts(identity)
+        return y
+
+    @torch.no_grad()
+    def moe_infer(self, x, flat_expert_indices, flat_expert_weights):
+        expert_cache = torch.zeros_like(x)
+        idxs = flat_expert_indices.argsort()
+        tokens_per_expert = flat_expert_indices.bincount().cpu().numpy().cumsum(0)
+        token_idxs = idxs // self.num_activated_experts
+        for i, end_idx in enumerate(tokens_per_expert):
+            start_idx = 0 if i == 0 else tokens_per_expert[i-1]
+            if start_idx == end_idx:
+                continue
+            expert = self.experts[i]
+            exp_token_idx = token_idxs[start_idx:end_idx]
+            expert_tokens = x[exp_token_idx]
+            expert_out = expert(expert_tokens)
+            expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]])
+
+            # for fp16 and other dtype
+            expert_cache = expert_cache.to(expert_out.dtype)
+            expert_cache.scatter_reduce_(0, exp_token_idx.view(-1, 1).repeat(1, x.shape[-1]), expert_out, reduce='sum')
+        return expert_cache
+
+
+class TextProjection(nn.Module):
+    def __init__(self, in_features, hidden_size, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.linear = operations.Linear(in_features=in_features, out_features=hidden_size, bias=False, dtype=dtype, device=device)
+
+    def forward(self, caption):
+        hidden_states = self.linear(caption)
+        return hidden_states
+
+
+class BlockType:
+    TransformerBlock = 1
+    SingleTransformerBlock = 2
+
+
+class HiDreamImageSingleTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        num_routed_experts: int = 4,
+        num_activated_experts: int = 2,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+        self.num_attention_heads = num_attention_heads
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device)
+        )
+
+        # 1. Attention
+        self.norm1_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
+        self.attn1 = HiDreamAttention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            processor = HiDreamAttnProcessor_flashattn(),
+            single = True,
+            dtype=dtype, device=device, operations=operations
+        )
+
+        # 3. Feed-forward
+        self.norm3_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
+        if num_routed_experts > 0:
+            self.ff_i = MOEFeedForwardSwiGLU(
+                dim = dim,
+                hidden_dim = 4 * dim,
+                num_routed_experts = num_routed_experts,
+                num_activated_experts = num_activated_experts,
+                dtype=dtype, device=device, operations=operations
+            )
+        else:
+            self.ff_i = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
+
+    def forward(
+        self,
+        image_tokens: torch.FloatTensor,
+        image_tokens_masks: Optional[torch.FloatTensor] = None,
+        text_tokens: Optional[torch.FloatTensor] = None,
+        adaln_input: Optional[torch.FloatTensor] = None,
+        rope: torch.FloatTensor = None,
+
+    ) -> torch.FloatTensor:
+        wtype = image_tokens.dtype
+        shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i = \
+            self.adaLN_modulation(adaln_input)[:,None].chunk(6, dim=-1)
+
+        # 1. MM-Attention
+        norm_image_tokens = self.norm1_i(image_tokens).to(dtype=wtype)
+        norm_image_tokens = norm_image_tokens * (1 + scale_msa_i) + shift_msa_i
+        attn_output_i = self.attn1(
+            norm_image_tokens,
+            image_tokens_masks,
+            rope = rope,
+        )
+        image_tokens = gate_msa_i * attn_output_i + image_tokens
+
+        # 2. Feed-forward
+        norm_image_tokens = self.norm3_i(image_tokens).to(dtype=wtype)
+        norm_image_tokens = norm_image_tokens * (1 + scale_mlp_i) + shift_mlp_i
+        ff_output_i = gate_mlp_i * self.ff_i(norm_image_tokens.to(dtype=wtype))
+        image_tokens = ff_output_i + image_tokens
+        return image_tokens
+
+
+class HiDreamImageTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        num_routed_experts: int = 4,
+        num_activated_experts: int = 2,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+        self.num_attention_heads = num_attention_heads
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(dim, 12 * dim, bias=True, dtype=dtype, device=device)
+        )
+        # nn.init.zeros_(self.adaLN_modulation[1].weight)
+        # nn.init.zeros_(self.adaLN_modulation[1].bias)
+
+        # 1. Attention
+        self.norm1_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
+        self.norm1_t = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
+        self.attn1 = HiDreamAttention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            processor = HiDreamAttnProcessor_flashattn(),
+            single = False,
+            dtype=dtype, device=device, operations=operations
+        )
+
+        # 3. Feed-forward
+        self.norm3_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
+        if num_routed_experts > 0:
+            self.ff_i = MOEFeedForwardSwiGLU(
+                dim = dim,
+                hidden_dim = 4 * dim,
+                num_routed_experts = num_routed_experts,
+                num_activated_experts = num_activated_experts,
+                dtype=dtype, device=device, operations=operations
+            )
+        else:
+            self.ff_i = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
+        self.norm3_t = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False)
+        self.ff_t = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
+
+    def forward(
+        self,
+        image_tokens: torch.FloatTensor,
+        image_tokens_masks: Optional[torch.FloatTensor] = None,
+        text_tokens: Optional[torch.FloatTensor] = None,
+        adaln_input: Optional[torch.FloatTensor] = None,
+        rope: torch.FloatTensor = None,
+    ) -> torch.FloatTensor:
+        wtype = image_tokens.dtype
+        shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i, \
+        shift_msa_t, scale_msa_t, gate_msa_t, shift_mlp_t, scale_mlp_t, gate_mlp_t = \
+            self.adaLN_modulation(adaln_input)[:,None].chunk(12, dim=-1)
+
+        # 1. MM-Attention
+        norm_image_tokens = self.norm1_i(image_tokens).to(dtype=wtype)
+        norm_image_tokens = norm_image_tokens * (1 + scale_msa_i) + shift_msa_i
+        norm_text_tokens = self.norm1_t(text_tokens).to(dtype=wtype)
+        norm_text_tokens = norm_text_tokens * (1 + scale_msa_t) + shift_msa_t
+
+        attn_output_i, attn_output_t = self.attn1(
+            norm_image_tokens,
+            image_tokens_masks,
+            norm_text_tokens,
+            rope = rope,
+        )
+
+        image_tokens = gate_msa_i * attn_output_i + image_tokens
+        text_tokens = gate_msa_t * attn_output_t + text_tokens
+
+        # 2. Feed-forward
+        norm_image_tokens = self.norm3_i(image_tokens).to(dtype=wtype)
+        norm_image_tokens = norm_image_tokens * (1 + scale_mlp_i) + shift_mlp_i
+        norm_text_tokens = self.norm3_t(text_tokens).to(dtype=wtype)
+        norm_text_tokens = norm_text_tokens * (1 + scale_mlp_t) + shift_mlp_t
+
+        ff_output_i = gate_mlp_i * self.ff_i(norm_image_tokens)
+        ff_output_t = gate_mlp_t * self.ff_t(norm_text_tokens)
+        image_tokens = ff_output_i + image_tokens
+        text_tokens = ff_output_t + text_tokens
+        return image_tokens, text_tokens
+
+
+class HiDreamImageBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        num_routed_experts: int = 4,
+        num_activated_experts: int = 2,
+        block_type: BlockType = BlockType.TransformerBlock,
+        dtype=None, device=None, operations=None
+    ):
+        super().__init__()
+        block_classes = {
+            BlockType.TransformerBlock: HiDreamImageTransformerBlock,
+            BlockType.SingleTransformerBlock: HiDreamImageSingleTransformerBlock,
+        }
+        self.block = block_classes[block_type](
+            dim,
+            num_attention_heads,
+            attention_head_dim,
+            num_routed_experts,
+            num_activated_experts,
+            dtype=dtype, device=device, operations=operations
+        )
+
+    def forward(
+        self,
+        image_tokens: torch.FloatTensor,
+        image_tokens_masks: Optional[torch.FloatTensor] = None,
+        text_tokens: Optional[torch.FloatTensor] = None,
+        adaln_input: torch.FloatTensor = None,
+        rope: torch.FloatTensor = None,
+    ) -> torch.FloatTensor:
+        return self.block(
+            image_tokens,
+            image_tokens_masks,
+            text_tokens,
+            adaln_input,
+            rope,
+        )
+
+
+class HiDreamImageTransformer2DModel(nn.Module):
+    def __init__(
+        self,
+        patch_size: Optional[int] = None,
+        in_channels: int = 64,
+        out_channels: Optional[int] = None,
+        num_layers: int = 16,
+        num_single_layers: int = 32,
+        attention_head_dim: int = 128,
+        num_attention_heads: int = 20,
+        caption_channels: List[int] = None,
+        text_emb_dim: int = 2048,
+        num_routed_experts: int = 4,
+        num_activated_experts: int = 2,
+        axes_dims_rope: Tuple[int, int] = (32, 32),
+        max_resolution: Tuple[int, int] = (128, 128),
+        llama_layers: List[int] = None,
+        image_model=None,
+        dtype=None, device=None, operations=None
+    ):
+        self.patch_size = patch_size
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.num_layers = num_layers
+        self.num_single_layers = num_single_layers
+
+        self.gradient_checkpointing = False
+
+        super().__init__()
+        self.dtype = dtype
+        self.out_channels = out_channels or in_channels
+        self.inner_dim = self.num_attention_heads * self.attention_head_dim
+        self.llama_layers = llama_layers
+
+        self.t_embedder = TimestepEmbed(self.inner_dim, dtype=dtype, device=device, operations=operations)
+        self.p_embedder = PooledEmbed(text_emb_dim, self.inner_dim, dtype=dtype, device=device, operations=operations)
+        self.x_embedder = PatchEmbed(
+            patch_size = patch_size,
+            in_channels = in_channels,
+            out_channels = self.inner_dim,
+            dtype=dtype, device=device, operations=operations
+        )
+        self.pe_embedder = EmbedND(theta=10000, axes_dim=axes_dims_rope)
+
+        self.double_stream_blocks = nn.ModuleList(
+            [
+                HiDreamImageBlock(
+                    dim = self.inner_dim,
+                    num_attention_heads = self.num_attention_heads,
+                    attention_head_dim = self.attention_head_dim,
+                    num_routed_experts = num_routed_experts,
+                    num_activated_experts = num_activated_experts,
+                    block_type = BlockType.TransformerBlock,
+                    dtype=dtype, device=device, operations=operations
+                )
+                for i in range(self.num_layers)
+            ]
+        )
+
+        self.single_stream_blocks = nn.ModuleList(
+            [
+                HiDreamImageBlock(
+                    dim = self.inner_dim,
+                    num_attention_heads = self.num_attention_heads,
+                    attention_head_dim = self.attention_head_dim,
+                    num_routed_experts = num_routed_experts,
+                    num_activated_experts = num_activated_experts,
+                    block_type = BlockType.SingleTransformerBlock,
+                    dtype=dtype, device=device, operations=operations
+                )
+                for i in range(self.num_single_layers)
+            ]
+        )
+
+        self.final_layer = OutEmbed(self.inner_dim, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations)
+
+        caption_channels = [caption_channels[1], ] * (num_layers + num_single_layers) + [caption_channels[0], ]
+        caption_projection = []
+        for caption_channel in caption_channels:
+            caption_projection.append(TextProjection(in_features=caption_channel, hidden_size=self.inner_dim, dtype=dtype, device=device, operations=operations))
+        self.caption_projection = nn.ModuleList(caption_projection)
+        self.max_seq = max_resolution[0] * max_resolution[1] // (patch_size * patch_size)
+
+    def expand_timesteps(self, timesteps, batch_size, device):
+        if not torch.is_tensor(timesteps):
+            is_mps = device.type == "mps"
+            if isinstance(timesteps, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(device)
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(batch_size)
+        return timesteps
+
+    def unpatchify(self, x: torch.Tensor, img_sizes: List[Tuple[int, int]]) -> List[torch.Tensor]:
+        x_arr = []
+        for i, img_size in enumerate(img_sizes):
+            pH, pW = img_size
+            x_arr.append(
+                einops.rearrange(x[i, :pH*pW].reshape(1, pH, pW, -1), 'B H W (p1 p2 C) -> B C (H p1) (W p2)',
+                    p1=self.patch_size, p2=self.patch_size)
+            )
+        x = torch.cat(x_arr, dim=0)
+        return x
+
+    def patchify(self, x, max_seq, img_sizes=None):
+        pz2 = self.patch_size * self.patch_size
+        if isinstance(x, torch.Tensor):
+            B = x.shape[0]
+            device = x.device
+            dtype = x.dtype
+        else:
+            B = len(x)
+            device = x[0].device
+            dtype = x[0].dtype
+        x_masks = torch.zeros((B, max_seq), dtype=dtype, device=device)
+
+        if img_sizes is not None:
+            for i, img_size in enumerate(img_sizes):
+                x_masks[i, 0:img_size[0] * img_size[1]] = 1
+            x = einops.rearrange(x, 'B C S p -> B S (p C)', p=pz2)
+        elif isinstance(x, torch.Tensor):
+            pH, pW = x.shape[-2] // self.patch_size, x.shape[-1] // self.patch_size
+            x = einops.rearrange(x, 'B C (H p1) (W p2) -> B (H W) (p1 p2 C)', p1=self.patch_size, p2=self.patch_size)
+            img_sizes = [[pH, pW]] * B
+            x_masks = None
+        else:
+            raise NotImplementedError
+        return x, x_masks, img_sizes
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        t: torch.Tensor,
+        y: Optional[torch.Tensor] = None,
+        context: Optional[torch.Tensor] = None,
+        encoder_hidden_states_llama3=None,
+        control = None,
+        transformer_options = {},
+    ) -> torch.Tensor:
+        hidden_states = x
+        timesteps = t
+        pooled_embeds = y
+        T5_encoder_hidden_states = context
+
+        img_sizes = None
+
+        # spatial forward
+        batch_size = hidden_states.shape[0]
+        hidden_states_type = hidden_states.dtype
+
+        # 0. time
+        timesteps = self.expand_timesteps(timesteps, batch_size, hidden_states.device)
+        timesteps = self.t_embedder(timesteps, hidden_states_type)
+        p_embedder = self.p_embedder(pooled_embeds)
+        adaln_input = timesteps + p_embedder
+
+        hidden_states, image_tokens_masks, img_sizes = self.patchify(hidden_states, self.max_seq, img_sizes)
+        if image_tokens_masks is None:
+            pH, pW = img_sizes[0]
+            img_ids = torch.zeros(pH, pW, 3, device=hidden_states.device)
+            img_ids[..., 1] = img_ids[..., 1] + torch.arange(pH, device=hidden_states.device)[:, None]
+            img_ids[..., 2] = img_ids[..., 2] + torch.arange(pW, device=hidden_states.device)[None, :]
+            img_ids = repeat(img_ids, "h w c -> b (h w) c", b=batch_size)
+        hidden_states = self.x_embedder(hidden_states)
+
+        # T5_encoder_hidden_states = encoder_hidden_states[0]
+        encoder_hidden_states = encoder_hidden_states_llama3.movedim(1, 0)
+        encoder_hidden_states = [encoder_hidden_states[k] for k in self.llama_layers]
+
+        if self.caption_projection is not None:
+            new_encoder_hidden_states = []
+            for i, enc_hidden_state in enumerate(encoder_hidden_states):
+                enc_hidden_state = self.caption_projection[i](enc_hidden_state)
+                enc_hidden_state = enc_hidden_state.view(batch_size, -1, hidden_states.shape[-1])
+                new_encoder_hidden_states.append(enc_hidden_state)
+            encoder_hidden_states = new_encoder_hidden_states
+            T5_encoder_hidden_states = self.caption_projection[-1](T5_encoder_hidden_states)
+            T5_encoder_hidden_states = T5_encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+            encoder_hidden_states.append(T5_encoder_hidden_states)
+
+        txt_ids = torch.zeros(
+            batch_size,
+            encoder_hidden_states[-1].shape[1] + encoder_hidden_states[-2].shape[1] + encoder_hidden_states[0].shape[1],
+            3,
+            device=img_ids.device, dtype=img_ids.dtype
+        )
+        ids = torch.cat((img_ids, txt_ids), dim=1)
+        rope = self.pe_embedder(ids)
+
+        # 2. Blocks
+        block_id = 0
+        initial_encoder_hidden_states = torch.cat([encoder_hidden_states[-1], encoder_hidden_states[-2]], dim=1)
+        initial_encoder_hidden_states_seq_len = initial_encoder_hidden_states.shape[1]
+        for bid, block in enumerate(self.double_stream_blocks):
+            cur_llama31_encoder_hidden_states = encoder_hidden_states[block_id]
+            cur_encoder_hidden_states = torch.cat([initial_encoder_hidden_states, cur_llama31_encoder_hidden_states], dim=1)
+            hidden_states, initial_encoder_hidden_states = block(
+                image_tokens = hidden_states,
+                image_tokens_masks = image_tokens_masks,
+                text_tokens = cur_encoder_hidden_states,
+                adaln_input = adaln_input,
+                rope = rope,
+            )
+            initial_encoder_hidden_states = initial_encoder_hidden_states[:, :initial_encoder_hidden_states_seq_len]
+            block_id += 1
+
+        image_tokens_seq_len = hidden_states.shape[1]
+        hidden_states = torch.cat([hidden_states, initial_encoder_hidden_states], dim=1)
+        hidden_states_seq_len = hidden_states.shape[1]
+        if image_tokens_masks is not None:
+            encoder_attention_mask_ones = torch.ones(
+                (batch_size, initial_encoder_hidden_states.shape[1] + cur_llama31_encoder_hidden_states.shape[1]),
+                device=image_tokens_masks.device, dtype=image_tokens_masks.dtype
+            )
+            image_tokens_masks = torch.cat([image_tokens_masks, encoder_attention_mask_ones], dim=1)
+
+        for bid, block in enumerate(self.single_stream_blocks):
+            cur_llama31_encoder_hidden_states = encoder_hidden_states[block_id]
+            hidden_states = torch.cat([hidden_states, cur_llama31_encoder_hidden_states], dim=1)
+            hidden_states = block(
+                image_tokens=hidden_states,
+                image_tokens_masks=image_tokens_masks,
+                text_tokens=None,
+                adaln_input=adaln_input,
+                rope=rope,
+            )
+            hidden_states = hidden_states[:, :hidden_states_seq_len]
+            block_id += 1
+
+        hidden_states = hidden_states[:, :image_tokens_seq_len, ...]
+        output = self.final_layer(hidden_states, adaln_input)
+        output = self.unpatchify(output, img_sizes)
+        return -output
--- a/comfy/ldm/hunyuan3d/model.py
+++ b/comfy/ldm/hunyuan3d/model.py
@ -0,0 +1,135 @@
+import torch
+from torch import nn
+from comfy.ldm.flux.layers import (
+    DoubleStreamBlock,
+    LastLayer,
+    MLPEmbedder,
+    SingleStreamBlock,
+    timestep_embedding,
+)
+
+
+class Hunyuan3Dv2(nn.Module):
+    def __init__(
+        self,
+        in_channels=64,
+        context_in_dim=1536,
+        hidden_size=1024,
+        mlp_ratio=4.0,
+        num_heads=16,
+        depth=16,
+        depth_single_blocks=32,
+        qkv_bias=True,
+        guidance_embed=False,
+        image_model=None,
+        dtype=None,
+        device=None,
+        operations=None
+    ):
+        super().__init__()
+        self.dtype = dtype
+
+        if hidden_size % num_heads != 0:
+            raise ValueError(
+                f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
+            )
+
+        self.max_period = 1000  # While reimplementing the model I noticed that they messed up. This 1000 value was meant to be the time_factor but they set the max_period instead
+        self.latent_in = operations.Linear(in_channels, hidden_size, bias=True, dtype=dtype, device=device)
+        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=hidden_size, dtype=dtype, device=device, operations=operations)
+        self.guidance_in = (
+            MLPEmbedder(in_dim=256, hidden_dim=hidden_size, dtype=dtype, device=device, operations=operations) if guidance_embed else None
+        )
+        self.cond_in = operations.Linear(context_in_dim, hidden_size, dtype=dtype, device=device)
+        self.double_blocks = nn.ModuleList(
+            [
+                DoubleStreamBlock(
+                    hidden_size,
+                    num_heads,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    dtype=dtype, device=device, operations=operations
+                )
+                for _ in range(depth)
+            ]
+        )
+        self.single_blocks = nn.ModuleList(
+            [
+                SingleStreamBlock(
+                    hidden_size,
+                    num_heads,
+                    mlp_ratio=mlp_ratio,
+                    dtype=dtype, device=device, operations=operations
+                )
+                for _ in range(depth_single_blocks)
+            ]
+        )
+        self.final_layer = LastLayer(hidden_size, 1, in_channels, dtype=dtype, device=device, operations=operations)
+
+    def forward(self, x, timestep, context, guidance=None, transformer_options={}, **kwargs):
+        x = x.movedim(-1, -2)
+        timestep = 1.0 - timestep
+        txt = context
+        img = self.latent_in(x)
+
+        vec = self.time_in(timestep_embedding(timestep, 256, self.max_period).to(dtype=img.dtype))
+        if self.guidance_in is not None:
+            if guidance is not None:
+                vec = vec + self.guidance_in(timestep_embedding(guidance, 256, self.max_period).to(img.dtype))
+
+        txt = self.cond_in(txt)
+        pe = None
+        attn_mask = None
+
+        patches_replace = transformer_options.get("patches_replace", {})
+        blocks_replace = patches_replace.get("dit", {})
+        for i, block in enumerate(self.double_blocks):
+            if ("double_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["img"], out["txt"] = block(img=args["img"],
+                                                   txt=args["txt"],
+                                                   vec=args["vec"],
+                                                   pe=args["pe"],
+                                                   attn_mask=args.get("attn_mask"))
+                    return out
+
+                out = blocks_replace[("double_block", i)]({"img": img,
+                                                           "txt": txt,
+                                                           "vec": vec,
+                                                           "pe": pe,
+                                                           "attn_mask": attn_mask},
+                                                          {"original_block": block_wrap})
+                txt = out["txt"]
+                img = out["img"]
+            else:
+                img, txt = block(img=img,
+                                 txt=txt,
+                                 vec=vec,
+                                 pe=pe,
+                                 attn_mask=attn_mask)
+
+        img = torch.cat((txt, img), 1)
+
+        for i, block in enumerate(self.single_blocks):
+            if ("single_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["img"] = block(args["img"],
+                                       vec=args["vec"],
+                                       pe=args["pe"],
+                                       attn_mask=args.get("attn_mask"))
+                    return out
+
+                out = blocks_replace[("single_block", i)]({"img": img,
+                                                           "vec": vec,
+                                                           "pe": pe,
+                                                           "attn_mask": attn_mask},
+                                                          {"original_block": block_wrap})
+                img = out["img"]
+            else:
+                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)
+
+        img = img[:, txt.shape[1]:, ...]
+        img = self.final_layer(img, vec)
+        return img.movedim(-2, -1) * (-1.0)
--- a/comfy/ldm/hunyuan3d/vae.py
+++ b/comfy/ldm/hunyuan3d/vae.py
@ -0,0 +1,587 @@
+# Original: https://github.com/Tencent/Hunyuan3D-2/blob/main/hy3dgen/shapegen/models/autoencoders/model.py
+# Since the header on their VAE source file was a bit confusing we asked for permission to use this code from tencent under the GPL license used in ComfyUI.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+from typing import Union, Tuple, List, Callable, Optional
+
+import numpy as np
+from einops import repeat, rearrange
+from tqdm import tqdm
+import logging
+
+import comfy.ops
+ops = comfy.ops.disable_weight_init
+
+def generate_dense_grid_points(
+    bbox_min: np.ndarray,
+    bbox_max: np.ndarray,
+    octree_resolution: int,
+    indexing: str = "ij",
+):
+    length = bbox_max - bbox_min
+    num_cells = octree_resolution
+
+    x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
+    y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
+    z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
+    [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
+    xyz = np.stack((xs, ys, zs), axis=-1)
+    grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]
+
+    return xyz, grid_size, length
+
+
+class VanillaVolumeDecoder:
+    @torch.no_grad()
+    def __call__(
+        self,
+        latents: torch.FloatTensor,
+        geo_decoder: Callable,
+        bounds: Union[Tuple[float], List[float], float] = 1.01,
+        num_chunks: int = 10000,
+        octree_resolution: int = None,
+        enable_pbar: bool = True,
+        **kwargs,
+    ):
+        device = latents.device
+        dtype = latents.dtype
+        batch_size = latents.shape[0]
+
+        # 1. generate query points
+        if isinstance(bounds, float):
+            bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
+
+        bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6])
+        xyz_samples, grid_size, length = generate_dense_grid_points(
+            bbox_min=bbox_min,
+            bbox_max=bbox_max,
+            octree_resolution=octree_resolution,
+            indexing="ij"
+        )
+        xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype).contiguous().reshape(-1, 3)
+
+        # 2. latents to 3d volume
+        batch_logits = []
+        for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), desc="Volume Decoding",
+                          disable=not enable_pbar):
+            chunk_queries = xyz_samples[start: start + num_chunks, :]
+            chunk_queries = repeat(chunk_queries, "p c -> b p c", b=batch_size)
+            logits = geo_decoder(queries=chunk_queries, latents=latents)
+            batch_logits.append(logits)
+
+        grid_logits = torch.cat(batch_logits, dim=1)
+        grid_logits = grid_logits.view((batch_size, *grid_size)).float()
+
+        return grid_logits
+
+
+class FourierEmbedder(nn.Module):
+    """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts
+    each feature dimension of `x[..., i]` into:
+        [
+            sin(x[..., i]),
+            sin(f_1*x[..., i]),
+            sin(f_2*x[..., i]),
+            ...
+            sin(f_N * x[..., i]),
+            cos(x[..., i]),
+            cos(f_1*x[..., i]),
+            cos(f_2*x[..., i]),
+            ...
+            cos(f_N * x[..., i]),
+            x[..., i]     # only present if include_input is True.
+        ], here f_i is the frequency.
+
+    Denote the space is [0 / num_freqs, 1 / num_freqs, 2 / num_freqs, 3 / num_freqs, ..., (num_freqs - 1) / num_freqs].
+    If logspace is True, then the frequency f_i is [2^(0 / num_freqs), ..., 2^(i / num_freqs), ...];
+    Otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)].
+
+    Args:
+        num_freqs (int): the number of frequencies, default is 6;
+        logspace (bool): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
+            otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)];
+        input_dim (int): the input dimension, default is 3;
+        include_input (bool): include the input tensor or not, default is True.
+
+    Attributes:
+        frequencies (torch.Tensor): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
+                otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1);
+
+        out_dim (int): the embedding size, if include_input is True, it is input_dim * (num_freqs * 2 + 1),
+            otherwise, it is input_dim * num_freqs * 2.
+
+    """
+
+    def __init__(self,
+                 num_freqs: int = 6,
+                 logspace: bool = True,
+                 input_dim: int = 3,
+                 include_input: bool = True,
+                 include_pi: bool = True) -> None:
+
+        """The initialization"""
+
+        super().__init__()
+
+        if logspace:
+            frequencies = 2.0 ** torch.arange(
+                num_freqs,
+                dtype=torch.float32
+            )
+        else:
+            frequencies = torch.linspace(
+                1.0,
+                2.0 ** (num_freqs - 1),
+                num_freqs,
+                dtype=torch.float32
+            )
+
+        if include_pi:
+            frequencies *= torch.pi
+
+        self.register_buffer("frequencies", frequencies, persistent=False)
+        self.include_input = include_input
+        self.num_freqs = num_freqs
+
+        self.out_dim = self.get_dims(input_dim)
+
+    def get_dims(self, input_dim):
+        temp = 1 if self.include_input or self.num_freqs == 0 else 0
+        out_dim = input_dim * (self.num_freqs * 2 + temp)
+
+        return out_dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """ Forward process.
+
+        Args:
+            x: tensor of shape [..., dim]
+
+        Returns:
+            embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)]
+                where temp is 1 if include_input is True and 0 otherwise.
+        """
+
+        if self.num_freqs > 0:
+            embed = (x[..., None].contiguous() * self.frequencies.to(device=x.device, dtype=x.dtype)).view(*x.shape[:-1], -1)
+            if self.include_input:
+                return torch.cat((x, embed.sin(), embed.cos()), dim=-1)
+            else:
+                return torch.cat((embed.sin(), embed.cos()), dim=-1)
+        else:
+            return x
+
+
+class CrossAttentionProcessor:
+    def __call__(self, attn, q, k, v):
+        out = F.scaled_dot_product_attention(q, k, v)
+        return out
+
+
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+    """
+
+    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x):
+        """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+        This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+        the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+        See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+        changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+        'survival rate' as the argument.
+
+        """
+        if self.drop_prob == 0. or not self.training:
+            return x
+        keep_prob = 1 - self.drop_prob
+        shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+        random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+        if keep_prob > 0.0 and self.scale_by_keep:
+            random_tensor.div_(keep_prob)
+        return x * random_tensor
+
+    def extra_repr(self):
+        return f'drop_prob={round(self.drop_prob, 3):0.3f}'
+
+
+class MLP(nn.Module):
+    def __init__(
+        self, *,
+        width: int,
+        expand_ratio: int = 4,
+        output_width: int = None,
+        drop_path_rate: float = 0.0
+    ):
+        super().__init__()
+        self.width = width
+        self.c_fc = ops.Linear(width, width * expand_ratio)
+        self.c_proj = ops.Linear(width * expand_ratio, output_width if output_width is not None else width)
+        self.gelu = nn.GELU()
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+
+    def forward(self, x):
+        return self.drop_path(self.c_proj(self.gelu(self.c_fc(x))))
+
+
+class QKVMultiheadCrossAttention(nn.Module):
+    def __init__(
+        self,
+        *,
+        heads: int,
+        width=None,
+        qk_norm=False,
+        norm_layer=ops.LayerNorm
+    ):
+        super().__init__()
+        self.heads = heads
+        self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
+        self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
+
+        self.attn_processor = CrossAttentionProcessor()
+
+    def forward(self, q, kv):
+        _, n_ctx, _ = q.shape
+        bs, n_data, width = kv.shape
+        attn_ch = width // self.heads // 2
+        q = q.view(bs, n_ctx, self.heads, -1)
+        kv = kv.view(bs, n_data, self.heads, -1)
+        k, v = torch.split(kv, attn_ch, dim=-1)
+
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+        q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
+        out = self.attn_processor(self, q, k, v)
+        out = out.transpose(1, 2).reshape(bs, n_ctx, -1)
+        return out
+
+
+class MultiheadCrossAttention(nn.Module):
+    def __init__(
+        self,
+        *,
+        width: int,
+        heads: int,
+        qkv_bias: bool = True,
+        data_width: Optional[int] = None,
+        norm_layer=ops.LayerNorm,
+        qk_norm: bool = False,
+        kv_cache: bool = False,
+    ):
+        super().__init__()
+        self.width = width
+        self.heads = heads
+        self.data_width = width if data_width is None else data_width
+        self.c_q = ops.Linear(width, width, bias=qkv_bias)
+        self.c_kv = ops.Linear(self.data_width, width * 2, bias=qkv_bias)
+        self.c_proj = ops.Linear(width, width)
+        self.attention = QKVMultiheadCrossAttention(
+            heads=heads,
+            width=width,
+            norm_layer=norm_layer,
+            qk_norm=qk_norm
+        )
+        self.kv_cache = kv_cache
+        self.data = None
+
+    def forward(self, x, data):
+        x = self.c_q(x)
+        if self.kv_cache:
+            if self.data is None:
+                self.data = self.c_kv(data)
+                logging.info('Save kv cache,this should be called only once for one mesh')
+            data = self.data
+        else:
+            data = self.c_kv(data)
+        x = self.attention(x, data)
+        x = self.c_proj(x)
+        return x
+
+
+class ResidualCrossAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        *,
+        width: int,
+        heads: int,
+        mlp_expand_ratio: int = 4,
+        data_width: Optional[int] = None,
+        qkv_bias: bool = True,
+        norm_layer=ops.LayerNorm,
+        qk_norm: bool = False
+    ):
+        super().__init__()
+
+        if data_width is None:
+            data_width = width
+
+        self.attn = MultiheadCrossAttention(
+            width=width,
+            heads=heads,
+            data_width=data_width,
+            qkv_bias=qkv_bias,
+            norm_layer=norm_layer,
+            qk_norm=qk_norm
+        )
+        self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
+        self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6)
+        self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6)
+        self.mlp = MLP(width=width, expand_ratio=mlp_expand_ratio)
+
+    def forward(self, x: torch.Tensor, data: torch.Tensor):
+        x = x + self.attn(self.ln_1(x), self.ln_2(data))
+        x = x + self.mlp(self.ln_3(x))
+        return x
+
+
+class QKVMultiheadAttention(nn.Module):
+    def __init__(
+        self,
+        *,
+        heads: int,
+        width=None,
+        qk_norm=False,
+        norm_layer=ops.LayerNorm
+    ):
+        super().__init__()
+        self.heads = heads
+        self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
+        self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
+
+    def forward(self, qkv):
+        bs, n_ctx, width = qkv.shape
+        attn_ch = width // self.heads // 3
+        qkv = qkv.view(bs, n_ctx, self.heads, -1)
+        q, k, v = torch.split(qkv, attn_ch, dim=-1)
+
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+
+        q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
+        out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
+        return out
+
+
+class MultiheadAttention(nn.Module):
+    def __init__(
+        self,
+        *,
+        width: int,
+        heads: int,
+        qkv_bias: bool,
+        norm_layer=ops.LayerNorm,
+        qk_norm: bool = False,
+        drop_path_rate: float = 0.0
+    ):
+        super().__init__()
+        self.width = width
+        self.heads = heads
+        self.c_qkv = ops.Linear(width, width * 3, bias=qkv_bias)
+        self.c_proj = ops.Linear(width, width)
+        self.attention = QKVMultiheadAttention(
+            heads=heads,
+            width=width,
+            norm_layer=norm_layer,
+            qk_norm=qk_norm
+        )
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+
+    def forward(self, x):
+        x = self.c_qkv(x)
+        x = self.attention(x)
+        x = self.drop_path(self.c_proj(x))
+        return x
+
+
+class ResidualAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        *,
+        width: int,
+        heads: int,
+        qkv_bias: bool = True,
+        norm_layer=ops.LayerNorm,
+        qk_norm: bool = False,
+        drop_path_rate: float = 0.0,
+    ):
+        super().__init__()
+        self.attn = MultiheadAttention(
+            width=width,
+            heads=heads,
+            qkv_bias=qkv_bias,
+            norm_layer=norm_layer,
+            qk_norm=qk_norm,
+            drop_path_rate=drop_path_rate
+        )
+        self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
+        self.mlp = MLP(width=width, drop_path_rate=drop_path_rate)
+        self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6)
+
+    def forward(self, x: torch.Tensor):
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+
+class Transformer(nn.Module):
+    def __init__(
+        self,
+        *,
+        width: int,
+        layers: int,
+        heads: int,
+        qkv_bias: bool = True,
+        norm_layer=ops.LayerNorm,
+        qk_norm: bool = False,
+        drop_path_rate: float = 0.0
+    ):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.ModuleList(
+            [
+                ResidualAttentionBlock(
+                    width=width,
+                    heads=heads,
+                    qkv_bias=qkv_bias,
+                    norm_layer=norm_layer,
+                    qk_norm=qk_norm,
+                    drop_path_rate=drop_path_rate
+                )
+                for _ in range(layers)
+            ]
+        )
+
+    def forward(self, x: torch.Tensor):
+        for block in self.resblocks:
+            x = block(x)
+        return x
+
+
+class CrossAttentionDecoder(nn.Module):
+
+    def __init__(
+        self,
+        *,
+        out_channels: int,
+        fourier_embedder: FourierEmbedder,
+        width: int,
+        heads: int,
+        mlp_expand_ratio: int = 4,
+        downsample_ratio: int = 1,
+        enable_ln_post: bool = True,
+        qkv_bias: bool = True,
+        qk_norm: bool = False,
+        label_type: str = "binary"
+    ):
+        super().__init__()
+
+        self.enable_ln_post = enable_ln_post
+        self.fourier_embedder = fourier_embedder
+        self.downsample_ratio = downsample_ratio
+        self.query_proj = ops.Linear(self.fourier_embedder.out_dim, width)
+        if self.downsample_ratio != 1:
+            self.latents_proj = ops.Linear(width * downsample_ratio, width)
+        if self.enable_ln_post == False:
+            qk_norm = False
+        self.cross_attn_decoder = ResidualCrossAttentionBlock(
+            width=width,
+            mlp_expand_ratio=mlp_expand_ratio,
+            heads=heads,
+            qkv_bias=qkv_bias,
+            qk_norm=qk_norm
+        )
+
+        if self.enable_ln_post:
+            self.ln_post = ops.LayerNorm(width)
+        self.output_proj = ops.Linear(width, out_channels)
+        self.label_type = label_type
+        self.count = 0
+
+    def forward(self, queries=None, query_embeddings=None, latents=None):
+        if query_embeddings is None:
+            query_embeddings = self.query_proj(self.fourier_embedder(queries).to(latents.dtype))
+        self.count += query_embeddings.shape[1]
+        if self.downsample_ratio != 1:
+            latents = self.latents_proj(latents)
+        x = self.cross_attn_decoder(query_embeddings, latents)
+        if self.enable_ln_post:
+            x = self.ln_post(x)
+        occ = self.output_proj(x)
+        return occ
+
+
+class ShapeVAE(nn.Module):
+    def __init__(
+        self,
+        *,
+        embed_dim: int,
+        width: int,
+        heads: int,
+        num_decoder_layers: int,
+        geo_decoder_downsample_ratio: int = 1,
+        geo_decoder_mlp_expand_ratio: int = 4,
+        geo_decoder_ln_post: bool = True,
+        num_freqs: int = 8,
+        include_pi: bool = True,
+        qkv_bias: bool = True,
+        qk_norm: bool = False,
+        label_type: str = "binary",
+        drop_path_rate: float = 0.0,
+        scale_factor: float = 1.0,
+    ):
+        super().__init__()
+        self.geo_decoder_ln_post = geo_decoder_ln_post
+
+        self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
+
+        self.post_kl = ops.Linear(embed_dim, width)
+
+        self.transformer = Transformer(
+            width=width,
+            layers=num_decoder_layers,
+            heads=heads,
+            qkv_bias=qkv_bias,
+            qk_norm=qk_norm,
+            drop_path_rate=drop_path_rate
+        )
+
+        self.geo_decoder = CrossAttentionDecoder(
+            fourier_embedder=self.fourier_embedder,
+            out_channels=1,
+            mlp_expand_ratio=geo_decoder_mlp_expand_ratio,
+            downsample_ratio=geo_decoder_downsample_ratio,
+            enable_ln_post=self.geo_decoder_ln_post,
+            width=width // geo_decoder_downsample_ratio,
+            heads=heads // geo_decoder_downsample_ratio,
+            qkv_bias=qkv_bias,
+            qk_norm=qk_norm,
+            label_type=label_type,
+        )
+
+        self.volume_decoder = VanillaVolumeDecoder()
+        self.scale_factor = scale_factor
+
+    def decode(self, latents, **kwargs):
+        latents = self.post_kl(latents.movedim(-2, -1))
+        latents = self.transformer(latents)
+
+        bounds = kwargs.get("bounds", 1.01)
+        num_chunks = kwargs.get("num_chunks", 8000)
+        octree_resolution = kwargs.get("octree_resolution", 256)
+        enable_pbar = kwargs.get("enable_pbar", True)
+
+        grid_logits = self.volume_decoder(latents, self.geo_decoder, bounds=bounds, num_chunks=num_chunks, octree_resolution=octree_resolution, enable_pbar=enable_pbar)
+        return grid_logits.movedim(-2, -1)
+
+    def encode(self, x):
+        return None
--- a/comfy/ldm/hunyuan_video/model.py
+++ b/comfy/ldm/hunyuan_video/model.py
@ -0,0 +1,340 @@
+#Based on Flux code because of weird hunyuan video code license.
+
+import torch
+import comfy.ldm.flux.layers
+import comfy.ldm.modules.diffusionmodules.mmdit
+from comfy.ldm.modules.attention import optimized_attention
+
+
+from dataclasses import dataclass
+from einops import repeat
+
+from torch import Tensor, nn
+
+from comfy.ldm.flux.layers import (
+    DoubleStreamBlock,
+    EmbedND,
+    LastLayer,
+    MLPEmbedder,
+    SingleStreamBlock,
+    timestep_embedding
+)
+
+import comfy.ldm.common_dit
+
+
+@dataclass
+class HunyuanVideoParams:
+    in_channels: int
+    out_channels: int
+    vec_in_dim: int
+    context_in_dim: int
+    hidden_size: int
+    mlp_ratio: float
+    num_heads: int
+    depth: int
+    depth_single_blocks: int
+    axes_dim: list
+    theta: int
+    patch_size: list
+    qkv_bias: bool
+    guidance_embed: bool
+
+
+class SelfAttentionRef(nn.Module):
+    def __init__(self, dim: int, qkv_bias: bool = False, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
+        self.proj = operations.Linear(dim, dim, dtype=dtype, device=device)
+
+
+class TokenRefinerBlock(nn.Module):
+    def __init__(
+        self,
+        hidden_size,
+        heads,
+        dtype=None,
+        device=None,
+        operations=None
+    ):
+        super().__init__()
+        self.heads = heads
+        mlp_hidden_dim = hidden_size * 4
+
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device),
+        )
+
+        self.norm1 = operations.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device)
+        self.self_attn = SelfAttentionRef(hidden_size, True, dtype=dtype, device=device, operations=operations)
+
+        self.norm2 = operations.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device)
+
+        self.mlp = nn.Sequential(
+            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
+        )
+
+    def forward(self, x, c, mask):
+        mod1, mod2 = self.adaLN_modulation(c).chunk(2, dim=1)
+
+        norm_x = self.norm1(x)
+        qkv = self.self_attn.qkv(norm_x)
+        q, k, v = qkv.reshape(qkv.shape[0], qkv.shape[1], 3, self.heads, -1).permute(2, 0, 3, 1, 4)
+        attn = optimized_attention(q, k, v, self.heads, mask=mask, skip_reshape=True)
+
+        x = x + self.self_attn.proj(attn) * mod1.unsqueeze(1)
+        x = x + self.mlp(self.norm2(x)) * mod2.unsqueeze(1)
+        return x
+
+
+class IndividualTokenRefiner(nn.Module):
+    def __init__(
+        self,
+        hidden_size,
+        heads,
+        num_blocks,
+        dtype=None,
+        device=None,
+        operations=None
+    ):
+        super().__init__()
+        self.blocks = nn.ModuleList(
+            [
+                TokenRefinerBlock(
+                    hidden_size=hidden_size,
+                    heads=heads,
+                    dtype=dtype,
+                    device=device,
+                    operations=operations
+                )
+                for _ in range(num_blocks)
+            ]
+        )
+
+    def forward(self, x, c, mask):
+        m = None
+        if mask is not None:
+            m = mask.view(mask.shape[0], 1, 1, mask.shape[1]).repeat(1, 1, mask.shape[1], 1)
+            m = m + m.transpose(2, 3)
+
+        for block in self.blocks:
+            x = block(x, c, m)
+        return x
+
+
+
+class TokenRefiner(nn.Module):
+    def __init__(
+        self,
+        text_dim,
+        hidden_size,
+        heads,
+        num_blocks,
+        dtype=None,
+        device=None,
+        operations=None
+    ):
+        super().__init__()
+
+        self.input_embedder = operations.Linear(text_dim, hidden_size, bias=True, dtype=dtype, device=device)
+        self.t_embedder = MLPEmbedder(256, hidden_size, dtype=dtype, device=device, operations=operations)
+        self.c_embedder = MLPEmbedder(text_dim, hidden_size, dtype=dtype, device=device, operations=operations)
+        self.individual_token_refiner = IndividualTokenRefiner(hidden_size, heads, num_blocks, dtype=dtype, device=device, operations=operations)
+
+    def forward(
+        self,
+        x,
+        timesteps,
+        mask,
+    ):
+        t = self.t_embedder(timestep_embedding(timesteps, 256, time_factor=1.0).to(x.dtype))
+        # m = mask.float().unsqueeze(-1)
+        # c = (x.float() * m).sum(dim=1) / m.sum(dim=1) #TODO: the following works when the x.shape is the same length as the tokens but might break otherwise
+        c = x.sum(dim=1) / x.shape[1]
+
+        c = t + self.c_embedder(c.to(x.dtype))
+        x = self.input_embedder(x)
+        x = self.individual_token_refiner(x, c, mask)
+        return x
+
+class HunyuanVideo(nn.Module):
+    """
+    Transformer model for flow matching on sequences.
+    """
+
+    def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
+        super().__init__()
+        self.dtype = dtype
+        params = HunyuanVideoParams(**kwargs)
+        self.params = params
+        self.patch_size = params.patch_size
+        self.in_channels = params.in_channels
+        self.out_channels = params.out_channels
+        if params.hidden_size % params.num_heads != 0:
+            raise ValueError(
+                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
+            )
+        pe_dim = params.hidden_size // params.num_heads
+        if sum(params.axes_dim) != pe_dim:
+            raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
+        self.hidden_size = params.hidden_size
+        self.num_heads = params.num_heads
+        self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
+
+        self.img_in = comfy.ldm.modules.diffusionmodules.mmdit.PatchEmbed(None, self.patch_size, self.in_channels, self.hidden_size, conv3d=True, dtype=dtype, device=device, operations=operations)
+        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations)
+        self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
+        self.guidance_in = (
+            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
+        )
+
+        self.txt_in = TokenRefiner(params.context_in_dim, self.hidden_size, self.num_heads, 2, dtype=dtype, device=device, operations=operations)
+
+        self.double_blocks = nn.ModuleList(
+            [
+                DoubleStreamBlock(
+                    self.hidden_size,
+                    self.num_heads,
+                    mlp_ratio=params.mlp_ratio,
+                    qkv_bias=params.qkv_bias,
+                    flipped_img_txt=True,
+                    dtype=dtype, device=device, operations=operations
+                )
+                for _ in range(params.depth)
+            ]
+        )
+
+        self.single_blocks = nn.ModuleList(
+            [
+                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
+                for _ in range(params.depth_single_blocks)
+            ]
+        )
+
+        if final_layer:
+            self.final_layer = LastLayer(self.hidden_size, self.patch_size[-1], self.out_channels, dtype=dtype, device=device, operations=operations)
+
+    def forward_orig(
+        self,
+        img: Tensor,
+        img_ids: Tensor,
+        txt: Tensor,
+        txt_ids: Tensor,
+        txt_mask: Tensor,
+        timesteps: Tensor,
+        y: Tensor,
+        guidance: Tensor = None,
+        guiding_frame_index=None,
+        control=None,
+        transformer_options={},
+    ) -> Tensor:
+        patches_replace = transformer_options.get("patches_replace", {})
+
+        initial_shape = list(img.shape)
+        # running on sequences img
+        img = self.img_in(img)
+        vec = self.time_in(timestep_embedding(timesteps, 256, time_factor=1.0).to(img.dtype))
+
+        if guiding_frame_index is not None:
+            token_replace_vec = self.time_in(timestep_embedding(guiding_frame_index, 256, time_factor=1.0))
+            vec_ = self.vector_in(y[:, :self.params.vec_in_dim])
+            vec = torch.cat([(vec_ + token_replace_vec).unsqueeze(1), (vec_ + vec).unsqueeze(1)], dim=1)
+            frame_tokens = (initial_shape[-1] // self.patch_size[-1]) * (initial_shape[-2] // self.patch_size[-2])
+            modulation_dims = [(0, frame_tokens, 0), (frame_tokens, None, 1)]
+            modulation_dims_txt = [(0, None, 1)]
+        else:
+            vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
+            modulation_dims = None
+            modulation_dims_txt = None
+
+        if self.params.guidance_embed:
+            if guidance is not None:
+                vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))
+
+        if txt_mask is not None and not torch.is_floating_point(txt_mask):
+            txt_mask = (txt_mask - 1).to(img.dtype) * torch.finfo(img.dtype).max
+
+        txt = self.txt_in(txt, timesteps, txt_mask)
+
+        ids = torch.cat((img_ids, txt_ids), dim=1)
+        pe = self.pe_embedder(ids)
+
+        img_len = img.shape[1]
+        if txt_mask is not None:
+            attn_mask_len = img_len + txt.shape[1]
+            attn_mask = torch.zeros((1, 1, attn_mask_len), dtype=img.dtype, device=img.device)
+            attn_mask[:, 0, img_len:] = txt_mask
+        else:
+            attn_mask = None
+
+        blocks_replace = patches_replace.get("dit", {})
+        for i, block in enumerate(self.double_blocks):
+            if ("double_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims_img=args["modulation_dims_img"], modulation_dims_txt=args["modulation_dims_txt"])
+                    return out
+
+                out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims_img': modulation_dims, 'modulation_dims_txt': modulation_dims_txt}, {"original_block": block_wrap})
+                txt = out["txt"]
+                img = out["img"]
+            else:
+                img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims_img=modulation_dims, modulation_dims_txt=modulation_dims_txt)
+
+            if control is not None: # Controlnet
+                control_i = control.get("input")
+                if i < len(control_i):
+                    add = control_i[i]
+                    if add is not None:
+                        img += add
+
+        img = torch.cat((img, txt), 1)
+
+        for i, block in enumerate(self.single_blocks):
+            if ("single_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims=args["modulation_dims"])
+                    return out
+
+                out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims': modulation_dims}, {"original_block": block_wrap})
+                img = out["img"]
+            else:
+                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims=modulation_dims)
+
+            if control is not None: # Controlnet
+                control_o = control.get("output")
+                if i < len(control_o):
+                    add = control_o[i]
+                    if add is not None:
+                        img[:, : img_len] += add
+
+        img = img[:, : img_len]
+
+        img = self.final_layer(img, vec, modulation_dims=modulation_dims)  # (N, T, patch_size ** 2 * out_channels)
+
+        shape = initial_shape[-3:]
+        for i in range(len(shape)):
+            shape[i] = shape[i] // self.patch_size[i]
+        img = img.reshape([img.shape[0]] + shape + [self.out_channels] + self.patch_size)
+        img = img.permute(0, 4, 1, 5, 2, 6, 3, 7)
+        img = img.reshape(initial_shape[0], self.out_channels, initial_shape[2], initial_shape[3], initial_shape[4])
+        return img
+
+    def forward(self, x, timestep, context, y, guidance=None, attention_mask=None, guiding_frame_index=None, control=None, transformer_options={}, **kwargs):
+        bs, c, t, h, w = x.shape
+        patch_size = self.patch_size
+        t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
+        h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
+        w_len = ((w + (patch_size[2] // 2)) // patch_size[2])
+        img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device, dtype=x.dtype)
+        img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1)
+        img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1)
+        img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).reshape(1, 1, -1)
+        img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs)
+        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
+        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, guidance, guiding_frame_index, control, transformer_options)
+        return out
--- a/comfy/ldm/hydit/controlnet.py
+++ b/comfy/ldm/hydit/controlnet.py
@ -1,24 +1,17 @@
-from typing import Any, Optional

 import torch
 import torch.nn as nn
-import torch.nn.functional as F

-from torch.utils import checkpoint

 from comfy.ldm.modules.diffusionmodules.mmdit import (
-    Mlp,
    TimestepEmbedder,
    PatchEmbed,
-    RMSNorm,
 )
-from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
 from .poolers import AttentionPool

 import comfy.latent_formats
 from .models import HunYuanDiTBlock, calc_rope

-from .posemb_layers import get_2d_rotary_pos_embed, get_fill_resize_and_crop


 class HunYuanControlNet(nn.Module):
@ -171,9 +164,6 @@ class HunYuanControlNet(nn.Module):
            ),
        )

-        # Image embedding
-        num_patches = self.x_embedder.num_patches
-
        # HUnYuanDiT Blocks
        self.blocks = nn.ModuleList(
            [
--- a/comfy/ldm/hydit/models.py
+++ b/comfy/ldm/hydit/models.py
@ -1,8 +1,6 @@
-from typing import Any

 import torch
 import torch.nn as nn
-import torch.nn.functional as F

 import comfy.ops
 from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, TimestepEmbedder, PatchEmbed, RMSNorm
@ -250,9 +248,6 @@ class HunYuanDiT(nn.Module):
            operations.Linear(hidden_size * 4, hidden_size, bias=True, dtype=dtype, device=device),
        )

-        # Image embedding
-        num_patches = self.x_embedder.num_patches
-
        # HUnYuanDiT Blocks
        self.blocks = nn.ModuleList([
            HunYuanDiTBlock(hidden_size=hidden_size,
--- a/comfy/ldm/hydit/poolers.py
+++ b/comfy/ldm/hydit/poolers.py
@ -1,6 +1,5 @@
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from comfy.ldm.modules.attention import optimized_attention
 import comfy.ops

--- a/comfy/ldm/lightricks/model.py
+++ b/comfy/ldm/lightricks/model.py
@ -7,7 +7,7 @@ from einops import rearrange
 import math
 from typing import Dict, Optional, Tuple

-from .symmetric_patchifier import SymmetricPatchifier
+from .symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords


 def get_timestep_embedding(
@ -377,11 +377,16 @@ class LTXVModel(torch.nn.Module):

                 positional_embedding_theta=10000.0,
                 positional_embedding_max_pos=[20, 2048, 2048],
+                 causal_temporal_positioning=False,
+                 vae_scale_factors=(8, 32, 32),
                 dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
+        self.generator = None
+        self.vae_scale_factors = vae_scale_factors
        self.dtype = dtype
        self.out_channels = in_channels
        self.inner_dim = num_attention_heads * attention_head_dim
+        self.causal_temporal_positioning = causal_temporal_positioning

        self.patchify_proj = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device)

@ -415,39 +420,31 @@ class LTXVModel(torch.nn.Module):

        self.patchifier = SymmetricPatchifier(1)

-    def forward(self, x, timestep, context, attention_mask, frame_rate=25, guiding_latent=None, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, attention_mask, frame_rate=25, transformer_options={}, keyframe_idxs=None, **kwargs):
        patches_replace = transformer_options.get("patches_replace", {})

-        indices_grid = self.patchifier.get_grid(
-            orig_num_frames=x.shape[2],
-            orig_height=x.shape[3],
-            orig_width=x.shape[4],
-            batch_size=x.shape[0],
-            scale_grid=((1 / frame_rate) * 8, 32, 32),
-            device=x.device,
-        )
-
-        if guiding_latent is not None:
-            ts = torch.ones([x.shape[0], 1, x.shape[2], x.shape[3], x.shape[4]], device=x.device, dtype=x.dtype)
-            input_ts = timestep.view([timestep.shape[0]] + [1] * (x.ndim - 1))
-            ts *= input_ts
-            ts[:, :, 0] = 0.0
-            timestep = self.patchifier.patchify(ts)
-            input_x = x.clone()
-            x[:, :, 0] = guiding_latent[:, :, 0]
-
        orig_shape = list(x.shape)

-        x = self.patchifier.patchify(x)
+        x, latent_coords = self.patchifier.patchify(x)
+        pixel_coords = latent_to_pixel_coords(
+            latent_coords=latent_coords,
+            scale_factors=self.vae_scale_factors,
+            causal_fix=self.causal_temporal_positioning,
+        )
+
+        if keyframe_idxs is not None:
+            pixel_coords[:, :, -keyframe_idxs.shape[2]:] = keyframe_idxs
+
+        fractional_coords = pixel_coords.to(torch.float32)
+        fractional_coords[:, 0] = fractional_coords[:, 0] * (1.0 / frame_rate)

        x = self.patchify_proj(x)
        timestep = timestep * 1000.0

-        attention_mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1]))
-        attention_mask = attention_mask.masked_fill(attention_mask.to(torch.bool), float("-inf"))  # not sure about this
-        # attention_mask = (context != 0).any(dim=2).to(dtype=x.dtype)
+        if attention_mask is not None and not torch.is_floating_point(attention_mask):
+            attention_mask = (attention_mask - 1).to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])) * torch.finfo(x.dtype).max

-        pe = precompute_freqs_cis(indices_grid, dim=self.inner_dim, out_dtype=x.dtype)
+        pe = precompute_freqs_cis(fractional_coords, dim=self.inner_dim, out_dtype=x.dtype)

        batch_size = x.shape[0]
        timestep, embedded_timestep = self.adaln_single(
@ -507,8 +504,4 @@ class LTXVModel(torch.nn.Module):
            out_channels=orig_shape[1] // math.prod(self.patchifier.patch_size),
        )

-        if guiding_latent is not None:
-            x[:, :, 0] = (input_x[:, :, 0] - guiding_latent[:, :, 0]) / input_ts[:, :, 0]
-
-        # print("res", x)
        return x
--- a/comfy/ldm/lightricks/symmetric_patchifier.py
+++ b/comfy/ldm/lightricks/symmetric_patchifier.py
@ -6,16 +6,29 @@ from einops import rearrange
 from torch import Tensor


-def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
-    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
-    dims_to_append = target_dims - x.ndim
-    if dims_to_append < 0:
-        raise ValueError(
-            f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
+def latent_to_pixel_coords(
+    latent_coords: Tensor, scale_factors: Tuple[int, int, int], causal_fix: bool = False
+) -> Tensor:
+    """
+    Converts latent coordinates to pixel coordinates by scaling them according to the VAE's
+    configuration.
+    Args:
+        latent_coords (Tensor): A tensor of shape [batch_size, 3, num_latents]
+        containing the latent corner coordinates of each token.
+        scale_factors (Tuple[int, int, int]): The scale factors of the VAE's latent space.
+        causal_fix (bool): Whether to take into account the different temporal scale
+            of the first frame. Default = False for backwards compatibility.
+    Returns:
+        Tensor: A tensor of pixel coordinates corresponding to the input latent coordinates.
+    """
+    pixel_coords = (
+        latent_coords
+        * torch.tensor(scale_factors, device=latent_coords.device)[None, :, None]
    )
-    elif dims_to_append == 0:
-        return x
-    return x[(...,) + (None,) * dims_to_append]
+    if causal_fix:
+        # Fix temporal scale for first frame to 1 due to causality
+        pixel_coords[:, 0] = (pixel_coords[:, 0] + 1 - scale_factors[0]).clamp(min=0)
+    return pixel_coords


 class Patchifier(ABC):
@ -44,29 +57,26 @@ class Patchifier(ABC):
    def patch_size(self):
        return self._patch_size

-    def get_grid(
-        self, orig_num_frames, orig_height, orig_width, batch_size, scale_grid, device
+    def get_latent_coords(
+        self, latent_num_frames, latent_height, latent_width, batch_size, device
    ):
-        f = orig_num_frames // self._patch_size[0]
-        h = orig_height // self._patch_size[1]
-        w = orig_width // self._patch_size[2]
-        grid_h = torch.arange(h, dtype=torch.float32, device=device)
-        grid_w = torch.arange(w, dtype=torch.float32, device=device)
-        grid_f = torch.arange(f, dtype=torch.float32, device=device)
-        grid = torch.meshgrid(grid_f, grid_h, grid_w)
-        grid = torch.stack(grid, dim=0)
-        grid = grid.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
-
-        if scale_grid is not None:
-            for i in range(3):
-                if isinstance(scale_grid[i], Tensor):
-                    scale = append_dims(scale_grid[i], grid.ndim - 1)
-                else:
-                    scale = scale_grid[i]
-                grid[:, i, ...] = grid[:, i, ...] * scale * self._patch_size[i]
-
-        grid = rearrange(grid, "b c f h w -> b c (f h w)", b=batch_size)
-        return grid
+        """
+        Return a tensor of shape [batch_size, 3, num_patches] containing the
+            top-left corner latent coordinates of each latent patch.
+        The tensor is repeated for each batch element.
+        """
+        latent_sample_coords = torch.meshgrid(
+            torch.arange(0, latent_num_frames, self._patch_size[0], device=device),
+            torch.arange(0, latent_height, self._patch_size[1], device=device),
+            torch.arange(0, latent_width, self._patch_size[2], device=device),
+            indexing="ij",
+        )
+        latent_sample_coords = torch.stack(latent_sample_coords, dim=0)
+        latent_coords = latent_sample_coords.unsqueeze(0).repeat(batch_size, 1, 1, 1, 1)
+        latent_coords = rearrange(
+            latent_coords, "b c f h w -> b c (f h w)", b=batch_size
+        )
+        return latent_coords


 class SymmetricPatchifier(Patchifier):
@ -74,6 +84,8 @@ class SymmetricPatchifier(Patchifier):
        self,
        latents: Tensor,
    ) -> Tuple[Tensor, Tensor]:
+        b, _, f, h, w = latents.shape
+        latent_coords = self.get_latent_coords(f, h, w, b, latents.device)
        latents = rearrange(
            latents,
            "b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)",
@ -81,7 +93,7 @@ class SymmetricPatchifier(Patchifier):
            p2=self._patch_size[1],
            p3=self._patch_size[2],
        )
-        return latents
+        return latents, latent_coords

    def unpatchify(
        self,
--- a/comfy/ldm/lightricks/vae/causal_conv3d.py
+++ b/comfy/ldm/lightricks/vae/causal_conv3d.py
@ -15,6 +15,7 @@ class CausalConv3d(nn.Module):
        stride: Union[int, Tuple[int]] = 1,
        dilation: int = 1,
        groups: int = 1,
+        spatial_padding_mode: str = "zeros",
        **kwargs,
    ):
        super().__init__()
@ -38,7 +39,7 @@ class CausalConv3d(nn.Module):
            stride=stride,
            dilation=dilation,
            padding=padding,
-            padding_mode="zeros",
+            padding_mode=spatial_padding_mode,
            groups=groups,
        )

--- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
@ -1,12 +1,16 @@
+from __future__ import annotations
 import torch
 from torch import nn
 from functools import partial
 import math
 from einops import rearrange
-from typing import Any, Mapping, Optional, Tuple, Union, List
+from typing import List, Optional, Tuple, Union
 from .conv_nd_factory import make_conv_nd, make_linear_nd
 from .pixel_norm import PixelNorm
+from ..model import PixArtAlphaCombinedTimestepSizeEmbeddings
+import comfy.ops

+ops = comfy.ops.disable_weight_init

 class Encoder(nn.Module):
    r"""
@ -30,7 +34,7 @@ class Encoder(nn.Module):
        norm_layer (`str`, *optional*, defaults to `group_norm`):
            The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
        latent_log_var (`str`, *optional*, defaults to `per_channel`):
-            The number of channels for the log variance. Can be either `per_channel`, `uniform`, or `none`.
+            The number of channels for the log variance. Can be either `per_channel`, `uniform`, `constant` or `none`.
    """

    def __init__(
@ -38,12 +42,13 @@ class Encoder(nn.Module):
        dims: Union[int, Tuple[int, int]] = 3,
        in_channels: int = 3,
        out_channels: int = 3,
-        blocks=[("res_x", 1)],
+        blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
        base_channels: int = 128,
        norm_num_groups: int = 32,
        patch_size: Union[int, Tuple[int]] = 1,
        norm_layer: str = "group_norm",  # group_norm, pixel_norm
        latent_log_var: str = "per_channel",
+        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        self.patch_size = patch_size
@ -63,6 +68,7 @@ class Encoder(nn.Module):
            stride=1,
            padding=1,
            causal=True,
+            spatial_padding_mode=spatial_padding_mode,
        )

        self.down_blocks = nn.ModuleList([])
@ -80,6 +86,7 @@ class Encoder(nn.Module):
                    resnet_eps=1e-6,
                    resnet_groups=norm_num_groups,
                    norm_layer=norm_layer,
+                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "res_x_y":
                output_channel = block_params.get("multiplier", 2) * output_channel
@ -90,6 +97,7 @@ class Encoder(nn.Module):
                    eps=1e-6,
                    groups=norm_num_groups,
                    norm_layer=norm_layer,
+                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_time":
                block = make_conv_nd(
@ -99,6 +107,7 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(2, 1, 1),
                    causal=True,
+                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_space":
                block = make_conv_nd(
@ -108,6 +117,7 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(1, 2, 2),
                    causal=True,
+                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_all":
                block = make_conv_nd(
@ -117,6 +127,7 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(2, 2, 2),
                    causal=True,
+                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_all_x_y":
                output_channel = block_params.get("multiplier", 2) * output_channel
@ -127,6 +138,34 @@ class Encoder(nn.Module):
                    kernel_size=3,
                    stride=(2, 2, 2),
                    causal=True,
+                    spatial_padding_mode=spatial_padding_mode,
+                )
+            elif block_name == "compress_all_res":
+                output_channel = block_params.get("multiplier", 2) * output_channel
+                block = SpaceToDepthDownsample(
+                    dims=dims,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    stride=(2, 2, 2),
+                    spatial_padding_mode=spatial_padding_mode,
+                )
+            elif block_name == "compress_space_res":
+                output_channel = block_params.get("multiplier", 2) * output_channel
+                block = SpaceToDepthDownsample(
+                    dims=dims,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    stride=(1, 2, 2),
+                    spatial_padding_mode=spatial_padding_mode,
+                )
+            elif block_name == "compress_time_res":
+                output_channel = block_params.get("multiplier", 2) * output_channel
+                block = SpaceToDepthDownsample(
+                    dims=dims,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    stride=(2, 1, 1),
+                    spatial_padding_mode=spatial_padding_mode,
                )
            else:
                raise ValueError(f"unknown block: {block_name}")
@ -150,10 +189,18 @@ class Encoder(nn.Module):
            conv_out_channels *= 2
        elif latent_log_var == "uniform":
            conv_out_channels += 1
+        elif latent_log_var == "constant":
+            conv_out_channels += 1
        elif latent_log_var != "none":
            raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
        self.conv_out = make_conv_nd(
-            dims, output_channel, conv_out_channels, 3, padding=1, causal=True
+            dims,
+            output_channel,
+            conv_out_channels,
+            3,
+            padding=1,
+            causal=True,
+            spatial_padding_mode=spatial_padding_mode,
        )

        self.gradient_checkpointing = False
@ -195,6 +242,15 @@ class Encoder(nn.Module):
                sample = torch.cat([sample, repeated_last_channel], dim=1)
            else:
                raise ValueError(f"Invalid input shape: {sample.shape}")
+        elif self.latent_log_var == "constant":
+            sample = sample[:, :-1, ...]
+            approx_ln_0 = (
+                -30
+            )  # this is the minimal clamp value in DiagonalGaussianDistribution objects
+            sample = torch.cat(
+                [sample, torch.ones_like(sample, device=sample.device) * approx_ln_0],
+                dim=1,
+            )

        return sample

@ -229,13 +285,15 @@ class Decoder(nn.Module):
        dims,
        in_channels: int = 3,
        out_channels: int = 3,
-        blocks=[("res_x", 1)],
+        blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
        base_channels: int = 128,
        layers_per_block: int = 2,
        norm_num_groups: int = 32,
        patch_size: int = 1,
        norm_layer: str = "group_norm",
        causal: bool = True,
+        timestep_conditioning: bool = False,
+        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        self.patch_size = patch_size
@ -250,6 +308,8 @@ class Decoder(nn.Module):
            block_params = block_params if isinstance(block_params, dict) else {}
            if block_name == "res_x_y":
                output_channel = output_channel * block_params.get("multiplier", 2)
+            if block_name == "compress_all":
+                output_channel = output_channel * block_params.get("multiplier", 1)

        self.conv_in = make_conv_nd(
            dims,
@ -259,6 +319,7 @@ class Decoder(nn.Module):
            stride=1,
            padding=1,
            causal=True,
+            spatial_padding_mode=spatial_padding_mode,
        )

        self.up_blocks = nn.ModuleList([])
@ -276,6 +337,21 @@ class Decoder(nn.Module):
                    resnet_eps=1e-6,
                    resnet_groups=norm_num_groups,
                    norm_layer=norm_layer,
+                    inject_noise=block_params.get("inject_noise", False),
+                    timestep_conditioning=timestep_conditioning,
+                    spatial_padding_mode=spatial_padding_mode,
+                )
+            elif block_name == "attn_res_x":
+                block = UNetMidBlock3D(
+                    dims=dims,
+                    in_channels=input_channel,
+                    num_layers=block_params["num_layers"],
+                    resnet_groups=norm_num_groups,
+                    norm_layer=norm_layer,
+                    inject_noise=block_params.get("inject_noise", False),
+                    timestep_conditioning=timestep_conditioning,
+                    attention_head_dim=block_params["attention_head_dim"],
+                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "res_x_y":
                output_channel = output_channel // block_params.get("multiplier", 2)
@ -286,21 +362,33 @@ class Decoder(nn.Module):
                    eps=1e-6,
                    groups=norm_num_groups,
                    norm_layer=norm_layer,
+                    inject_noise=block_params.get("inject_noise", False),
+                    timestep_conditioning=False,
+                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_time":
                block = DepthToSpaceUpsample(
-                    dims=dims, in_channels=input_channel, stride=(2, 1, 1)
+                    dims=dims,
+                    in_channels=input_channel,
+                    stride=(2, 1, 1),
+                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_space":
                block = DepthToSpaceUpsample(
-                    dims=dims, in_channels=input_channel, stride=(1, 2, 2)
+                    dims=dims,
+                    in_channels=input_channel,
+                    stride=(1, 2, 2),
+                    spatial_padding_mode=spatial_padding_mode,
                )
            elif block_name == "compress_all":
+                output_channel = output_channel // block_params.get("multiplier", 1)
                block = DepthToSpaceUpsample(
                    dims=dims,
                    in_channels=input_channel,
                    stride=(2, 2, 2),
                    residual=block_params.get("residual", False),
+                    out_channels_reduction_factor=block_params.get("multiplier", 1),
+                    spatial_padding_mode=spatial_padding_mode,
                )
            else:
                raise ValueError(f"unknown layer: {block_name}")
@ -318,32 +406,86 @@ class Decoder(nn.Module):

        self.conv_act = nn.SiLU()
        self.conv_out = make_conv_nd(
-            dims, output_channel, out_channels, 3, padding=1, causal=True
+            dims,
+            output_channel,
+            out_channels,
+            3,
+            padding=1,
+            causal=True,
+            spatial_padding_mode=spatial_padding_mode,
        )

        self.gradient_checkpointing = False

+        self.timestep_conditioning = timestep_conditioning
+
+        if timestep_conditioning:
+            self.timestep_scale_multiplier = nn.Parameter(
+                torch.tensor(1000.0, dtype=torch.float32)
+            )
+            self.last_time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(
+                output_channel * 2, 0, operations=ops,
+            )
+            self.last_scale_shift_table = nn.Parameter(torch.empty(2, output_channel))
+
    # def forward(self, sample: torch.FloatTensor, target_shape) -> torch.FloatTensor:
-    def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
        r"""The forward method of the `Decoder` class."""
-        # assert target_shape is not None, "target_shape must be provided"
+        batch_size = sample.shape[0]

        sample = self.conv_in(sample, causal=self.causal)

-        upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
-
        checkpoint_fn = (
            partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
            if self.gradient_checkpointing and self.training
            else lambda x: x
        )

-        sample = sample.to(upscale_dtype)
+        scaled_timestep = None
+        if self.timestep_conditioning:
+            assert (
+                timestep is not None
+            ), "should pass timestep with timestep_conditioning=True"
+            scaled_timestep = timestep * self.timestep_scale_multiplier.to(dtype=sample.dtype, device=sample.device)

        for up_block in self.up_blocks:
+            if self.timestep_conditioning and isinstance(up_block, UNetMidBlock3D):
+                sample = checkpoint_fn(up_block)(
+                    sample, causal=self.causal, timestep=scaled_timestep
+                )
+            else:
                sample = checkpoint_fn(up_block)(sample, causal=self.causal)

        sample = self.conv_norm_out(sample)
+
+        if self.timestep_conditioning:
+            embedded_timestep = self.last_time_embedder(
+                timestep=scaled_timestep.flatten(),
+                resolution=None,
+                aspect_ratio=None,
+                batch_size=sample.shape[0],
+                hidden_dtype=sample.dtype,
+            )
+            embedded_timestep = embedded_timestep.view(
+                batch_size, embedded_timestep.shape[-1], 1, 1, 1
+            )
+            ada_values = self.last_scale_shift_table[
+                None, ..., None, None, None
+            ].to(device=sample.device, dtype=sample.dtype) + embedded_timestep.reshape(
+                batch_size,
+                2,
+                -1,
+                embedded_timestep.shape[-3],
+                embedded_timestep.shape[-2],
+                embedded_timestep.shape[-1],
+            )
+            shift, scale = ada_values.unbind(dim=1)
+            sample = sample * (1 + scale) + shift
+
        sample = self.conv_act(sample)
        sample = self.conv_out(sample, causal=self.causal)

@ -363,6 +505,12 @@ class UNetMidBlock3D(nn.Module):
        resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
        resnet_groups (`int`, *optional*, defaults to 32):
            The number of groups to use in the group normalization layers of the resnet blocks.
+        norm_layer (`str`, *optional*, defaults to `group_norm`):
+            The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
+        inject_noise (`bool`, *optional*, defaults to `False`):
+            Whether to inject noise into the hidden states.
+        timestep_conditioning (`bool`, *optional*, defaults to `False`):
+            Whether to condition the hidden states on the timestep.

    Returns:
        `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
@ -379,12 +527,22 @@ class UNetMidBlock3D(nn.Module):
        resnet_eps: float = 1e-6,
        resnet_groups: int = 32,
        norm_layer: str = "group_norm",
+        inject_noise: bool = False,
+        timestep_conditioning: bool = False,
+        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        resnet_groups = (
            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
        )

+        self.timestep_conditioning = timestep_conditioning
+
+        if timestep_conditioning:
+            self.time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(
+                in_channels * 4, 0, operations=ops,
+            )
+
        self.res_blocks = nn.ModuleList(
            [
                ResnetBlock3D(
@ -395,25 +553,105 @@ class UNetMidBlock3D(nn.Module):
                    groups=resnet_groups,
                    dropout=dropout,
                    norm_layer=norm_layer,
+                    inject_noise=inject_noise,
+                    timestep_conditioning=timestep_conditioning,
+                    spatial_padding_mode=spatial_padding_mode,
                )
                for _ in range(num_layers)
            ]
        )

    def forward(
-        self, hidden_states: torch.FloatTensor, causal: bool = True
+        self,
+        hidden_states: torch.FloatTensor,
+        causal: bool = True,
+        timestep: Optional[torch.Tensor] = None,
    ) -> torch.FloatTensor:
+        timestep_embed = None
+        if self.timestep_conditioning:
+            assert (
+                timestep is not None
+            ), "should pass timestep with timestep_conditioning=True"
+            batch_size = hidden_states.shape[0]
+            timestep_embed = self.time_embedder(
+                timestep=timestep.flatten(),
+                resolution=None,
+                aspect_ratio=None,
+                batch_size=batch_size,
+                hidden_dtype=hidden_states.dtype,
+            )
+            timestep_embed = timestep_embed.view(
+                batch_size, timestep_embed.shape[-1], 1, 1, 1
+            )
+
        for resnet in self.res_blocks:
-            hidden_states = resnet(hidden_states, causal=causal)
+            hidden_states = resnet(hidden_states, causal=causal, timestep=timestep_embed)

        return hidden_states


-class DepthToSpaceUpsample(nn.Module):
-    def __init__(self, dims, in_channels, stride, residual=False):
+class SpaceToDepthDownsample(nn.Module):
+    def __init__(self, dims, in_channels, out_channels, stride, spatial_padding_mode):
        super().__init__()
        self.stride = stride
-        self.out_channels = math.prod(stride) * in_channels
+        self.group_size = in_channels * math.prod(stride) // out_channels
+        self.conv = make_conv_nd(
+            dims=dims,
+            in_channels=in_channels,
+            out_channels=out_channels // math.prod(stride),
+            kernel_size=3,
+            stride=1,
+            causal=True,
+            spatial_padding_mode=spatial_padding_mode,
+        )
+
+    def forward(self, x, causal: bool = True):
+        if self.stride[0] == 2:
+            x = torch.cat(
+                [x[:, :, :1, :, :], x], dim=2
+            )  # duplicate first frames for padding
+
+        # skip connection
+        x_in = rearrange(
+            x,
+            "b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
+            p1=self.stride[0],
+            p2=self.stride[1],
+            p3=self.stride[2],
+        )
+        x_in = rearrange(x_in, "b (c g) d h w -> b c g d h w", g=self.group_size)
+        x_in = x_in.mean(dim=2)
+
+        # conv
+        x = self.conv(x, causal=causal)
+        x = rearrange(
+            x,
+            "b c (d p1) (h p2) (w p3) -> b (c p1 p2 p3) d h w",
+            p1=self.stride[0],
+            p2=self.stride[1],
+            p3=self.stride[2],
+        )
+
+        x = x + x_in
+
+        return x
+
+
+class DepthToSpaceUpsample(nn.Module):
+    def __init__(
+        self,
+        dims,
+        in_channels,
+        stride,
+        residual=False,
+        out_channels_reduction_factor=1,
+        spatial_padding_mode="zeros",
+    ):
+        super().__init__()
+        self.stride = stride
+        self.out_channels = (
+            math.prod(stride) * in_channels // out_channels_reduction_factor
+        )
        self.conv = make_conv_nd(
            dims=dims,
            in_channels=in_channels,
@ -421,10 +659,12 @@ class DepthToSpaceUpsample(nn.Module):
            kernel_size=3,
            stride=1,
            causal=True,
+            spatial_padding_mode=spatial_padding_mode,
        )
        self.residual = residual
+        self.out_channels_reduction_factor = out_channels_reduction_factor

-    def forward(self, x, causal: bool = True):
+    def forward(self, x, causal: bool = True, timestep: Optional[torch.Tensor] = None):
        if self.residual:
            # Reshape and duplicate the input to match the output shape
            x_in = rearrange(
@ -434,7 +674,8 @@ class DepthToSpaceUpsample(nn.Module):
                p2=self.stride[1],
                p3=self.stride[2],
            )
-            x_in = x_in.repeat(1, math.prod(self.stride), 1, 1, 1)
+            num_repeat = math.prod(self.stride) // self.out_channels_reduction_factor
+            x_in = x_in.repeat(1, num_repeat, 1, 1, 1)
            if self.stride[0] == 2:
                x_in = x_in[:, :, 1:, :, :]
        x = self.conv(x, causal=causal)
@ -451,11 +692,10 @@ class DepthToSpaceUpsample(nn.Module):
            x = x + x_in
        return x

-
 class LayerNorm(nn.Module):
    def __init__(self, dim, eps, elementwise_affine=True) -> None:
        super().__init__()
-        self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)
+        self.norm = ops.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine)

    def forward(self, x):
        x = rearrange(x, "b c d h w -> b d h w c")
@ -486,11 +726,15 @@ class ResnetBlock3D(nn.Module):
        groups: int = 32,
        eps: float = 1e-6,
        norm_layer: str = "group_norm",
+        inject_noise: bool = False,
+        timestep_conditioning: bool = False,
+        spatial_padding_mode: str = "zeros",
    ):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
+        self.inject_noise = inject_noise

        if norm_layer == "group_norm":
            self.norm1 = nn.GroupNorm(
@ -511,8 +755,12 @@ class ResnetBlock3D(nn.Module):
            stride=1,
            padding=1,
            causal=True,
+            spatial_padding_mode=spatial_padding_mode,
        )

+        if inject_noise:
+            self.per_channel_scale1 = nn.Parameter(torch.zeros((in_channels, 1, 1)))
+
        if norm_layer == "group_norm":
            self.norm2 = nn.GroupNorm(
                num_groups=groups, num_channels=out_channels, eps=eps, affine=True
@ -532,8 +780,12 @@ class ResnetBlock3D(nn.Module):
            stride=1,
            padding=1,
            causal=True,
+            spatial_padding_mode=spatial_padding_mode,
        )

+        if inject_noise:
+            self.per_channel_scale2 = nn.Parameter(torch.zeros((in_channels, 1, 1)))
+
        self.conv_shortcut = (
            make_linear_nd(
                dims=dims, in_channels=in_channels, out_channels=out_channels
@ -548,29 +800,84 @@ class ResnetBlock3D(nn.Module):
            else nn.Identity()
        )

+        self.timestep_conditioning = timestep_conditioning
+
+        if timestep_conditioning:
+            self.scale_shift_table = nn.Parameter(
+                torch.randn(4, in_channels) / in_channels**0.5
+            )
+
+    def _feed_spatial_noise(
+        self, hidden_states: torch.FloatTensor, per_channel_scale: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        spatial_shape = hidden_states.shape[-2:]
+        device = hidden_states.device
+        dtype = hidden_states.dtype
+
+        # similar to the "explicit noise inputs" method in style-gan
+        spatial_noise = torch.randn(spatial_shape, device=device, dtype=dtype)[None]
+        scaled_noise = (spatial_noise * per_channel_scale)[None, :, None, ...]
+        hidden_states = hidden_states + scaled_noise
+
+        return hidden_states
+
    def forward(
        self,
        input_tensor: torch.FloatTensor,
        causal: bool = True,
+        timestep: Optional[torch.Tensor] = None,
    ) -> torch.FloatTensor:
        hidden_states = input_tensor
+        batch_size = hidden_states.shape[0]

        hidden_states = self.norm1(hidden_states)
+        if self.timestep_conditioning:
+            assert (
+                timestep is not None
+            ), "should pass timestep with timestep_conditioning=True"
+            ada_values = self.scale_shift_table[
+                None, ..., None, None, None
+            ].to(device=hidden_states.device, dtype=hidden_states.dtype) + timestep.reshape(
+                batch_size,
+                4,
+                -1,
+                timestep.shape[-3],
+                timestep.shape[-2],
+                timestep.shape[-1],
+            )
+            shift1, scale1, shift2, scale2 = ada_values.unbind(dim=1)
+
+            hidden_states = hidden_states * (1 + scale1) + shift1

        hidden_states = self.non_linearity(hidden_states)

        hidden_states = self.conv1(hidden_states, causal=causal)

+        if self.inject_noise:
+            hidden_states = self._feed_spatial_noise(
+                hidden_states, self.per_channel_scale1.to(device=hidden_states.device, dtype=hidden_states.dtype)
+            )
+
        hidden_states = self.norm2(hidden_states)

+        if self.timestep_conditioning:
+            hidden_states = hidden_states * (1 + scale2) + shift2
+
        hidden_states = self.non_linearity(hidden_states)

        hidden_states = self.dropout(hidden_states)

        hidden_states = self.conv2(hidden_states, causal=causal)

+        if self.inject_noise:
+            hidden_states = self._feed_spatial_noise(
+                hidden_states, self.per_channel_scale2.to(device=hidden_states.device, dtype=hidden_states.dtype)
+            )
+
        input_tensor = self.norm3(input_tensor)

+        batch_size = input_tensor.shape[0]
+
        input_tensor = self.conv_shortcut(input_tensor)

        output_tensor = input_tensor + hidden_states
@ -634,8 +941,45 @@ class processor(nn.Module):
        return (x - self.get_buffer("mean-of-means").view(1, -1, 1, 1, 1).to(x)) / self.get_buffer("std-of-means").view(1, -1, 1, 1, 1).to(x)

 class VideoVAE(nn.Module):
-    def __init__(self):
+    def __init__(self, version=0, config=None):
        super().__init__()
+
+        if config is None:
+            config = self.guess_config(version)
+
+        self.timestep_conditioning = config.get("timestep_conditioning", False)
+        double_z = config.get("double_z", True)
+        latent_log_var = config.get(
+            "latent_log_var", "per_channel" if double_z else "none"
+        )
+
+        self.encoder = Encoder(
+            dims=config["dims"],
+            in_channels=config.get("in_channels", 3),
+            out_channels=config["latent_channels"],
+            blocks=config.get("encoder_blocks", config.get("encoder_blocks", config.get("blocks"))),
+            patch_size=config.get("patch_size", 1),
+            latent_log_var=latent_log_var,
+            norm_layer=config.get("norm_layer", "group_norm"),
+            spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
+        )
+
+        self.decoder = Decoder(
+            dims=config["dims"],
+            in_channels=config["latent_channels"],
+            out_channels=config.get("out_channels", 3),
+            blocks=config.get("decoder_blocks", config.get("decoder_blocks", config.get("blocks"))),
+            patch_size=config.get("patch_size", 1),
+            norm_layer=config.get("norm_layer", "group_norm"),
+            causal=config.get("causal_decoder", False),
+            timestep_conditioning=self.timestep_conditioning,
+            spatial_padding_mode=config.get("spatial_padding_mode", "zeros"),
+        )
+
+        self.per_channel_statistics = processor()
+
+    def guess_config(self, version):
+        if version == 0:
            config = {
                "_class_name": "CausalVideoAutoencoder",
                "dims": 3,
@ -661,38 +1005,88 @@ class VideoVAE(nn.Module):
                "use_quant_conv": False,
                "causal_decoder": False,
            }
-
-        double_z = config.get("double_z", True)
-        latent_log_var = config.get(
-            "latent_log_var", "per_channel" if double_z else "none"
-        )
-
-        self.encoder = Encoder(
-            dims=config["dims"],
-            in_channels=config.get("in_channels", 3),
-            out_channels=config["latent_channels"],
-            blocks=config.get("encoder_blocks", config.get("blocks")),
-            patch_size=config.get("patch_size", 1),
-            latent_log_var=latent_log_var,
-            norm_layer=config.get("norm_layer", "group_norm"),
-        )
-
-        self.decoder = Decoder(
-            dims=config["dims"],
-            in_channels=config["latent_channels"],
-            out_channels=config.get("out_channels", 3),
-            blocks=config.get("decoder_blocks", config.get("blocks")),
-            patch_size=config.get("patch_size", 1),
-            norm_layer=config.get("norm_layer", "group_norm"),
-            causal=config.get("causal_decoder", False),
-        )
-
-        self.per_channel_statistics = processor()
+        elif version == 1:
+            config = {
+                "_class_name": "CausalVideoAutoencoder",
+                "dims": 3,
+                "in_channels": 3,
+                "out_channels": 3,
+                "latent_channels": 128,
+                "decoder_blocks": [
+                    ["res_x", {"num_layers": 5, "inject_noise": True}],
+                    ["compress_all", {"residual": True, "multiplier": 2}],
+                    ["res_x", {"num_layers": 6, "inject_noise": True}],
+                    ["compress_all", {"residual": True, "multiplier": 2}],
+                    ["res_x", {"num_layers": 7, "inject_noise": True}],
+                    ["compress_all", {"residual": True, "multiplier": 2}],
+                    ["res_x", {"num_layers": 8, "inject_noise": False}]
+                ],
+                "encoder_blocks": [
+                    ["res_x", {"num_layers": 4}],
+                    ["compress_all", {}],
+                    ["res_x_y", 1],
+                    ["res_x", {"num_layers": 3}],
+                    ["compress_all", {}],
+                    ["res_x_y", 1],
+                    ["res_x", {"num_layers": 3}],
+                    ["compress_all", {}],
+                    ["res_x", {"num_layers": 3}],
+                    ["res_x", {"num_layers": 4}]
+                ],
+                "scaling_factor": 1.0,
+                "norm_layer": "pixel_norm",
+                "patch_size": 4,
+                "latent_log_var": "uniform",
+                "use_quant_conv": False,
+                "causal_decoder": False,
+                "timestep_conditioning": True,
+            }
+        else:
+            config = {
+                "_class_name": "CausalVideoAutoencoder",
+                "dims": 3,
+                "in_channels": 3,
+                "out_channels": 3,
+                "latent_channels": 128,
+                "encoder_blocks": [
+                    ["res_x", {"num_layers": 4}],
+                    ["compress_space_res", {"multiplier": 2}],
+                    ["res_x", {"num_layers": 6}],
+                    ["compress_time_res", {"multiplier": 2}],
+                    ["res_x", {"num_layers": 6}],
+                    ["compress_all_res", {"multiplier": 2}],
+                    ["res_x", {"num_layers": 2}],
+                    ["compress_all_res", {"multiplier": 2}],
+                    ["res_x", {"num_layers": 2}]
+                ],
+                "decoder_blocks": [
+                    ["res_x", {"num_layers": 5, "inject_noise": False}],
+                    ["compress_all", {"residual": True, "multiplier": 2}],
+                    ["res_x", {"num_layers": 5, "inject_noise": False}],
+                    ["compress_all", {"residual": True, "multiplier": 2}],
+                    ["res_x", {"num_layers": 5, "inject_noise": False}],
+                    ["compress_all", {"residual": True, "multiplier": 2}],
+                    ["res_x", {"num_layers": 5, "inject_noise": False}]
+                ],
+                "scaling_factor": 1.0,
+                "norm_layer": "pixel_norm",
+                "patch_size": 4,
+                "latent_log_var": "uniform",
+                "use_quant_conv": False,
+                "causal_decoder": False,
+                "timestep_conditioning": True
+            }
+        return config

    def encode(self, x):
+        frames_count = x.shape[2]
+        if ((frames_count - 1) % 8) != 0:
+            raise ValueError("Invalid number of frames: Encode input must have 1 + 8 * x frames (e.g., 1, 9, 17, ...). Please check your input.")
        means, logvar = torch.chunk(self.encoder(x), 2, dim=1)
        return self.per_channel_statistics.normalize(means)

-    def decode(self, x):
-        return self.decoder(self.per_channel_statistics.un_normalize(x))
+    def decode(self, x, timestep=0.05, noise_scale=0.025):
+        if self.timestep_conditioning: #TODO: seed
+            x = torch.randn_like(x) * noise_scale + (1.0 - noise_scale) * x
+        return self.decoder(self.per_channel_statistics.un_normalize(x), timestep=timestep)

--- a/comfy/ldm/lightricks/vae/conv_nd_factory.py
+++ b/comfy/ldm/lightricks/vae/conv_nd_factory.py
@ -1,6 +1,5 @@
 from typing import Tuple, Union

-import torch

 from .dual_conv3d import DualConv3d
 from .causal_conv3d import CausalConv3d
@ -18,7 +17,11 @@ def make_conv_nd(
    groups=1,
    bias=True,
    causal=False,
+    spatial_padding_mode="zeros",
+    temporal_padding_mode="zeros",
 ):
+    if not (spatial_padding_mode == temporal_padding_mode or causal):
+        raise NotImplementedError("spatial and temporal padding modes must be equal")
    if dims == 2:
        return ops.Conv2d(
            in_channels=in_channels,
@ -29,6 +32,7 @@ def make_conv_nd(
            dilation=dilation,
            groups=groups,
            bias=bias,
+            padding_mode=spatial_padding_mode,
        )
    elif dims == 3:
        if causal:
@ -41,6 +45,7 @@ def make_conv_nd(
                dilation=dilation,
                groups=groups,
                bias=bias,
+                spatial_padding_mode=spatial_padding_mode,
            )
        return ops.Conv3d(
            in_channels=in_channels,
@ -51,6 +56,7 @@ def make_conv_nd(
            dilation=dilation,
            groups=groups,
            bias=bias,
+            padding_mode=spatial_padding_mode,
        )
    elif dims == (2, 1):
        return DualConv3d(
@ -60,6 +66,7 @@ def make_conv_nd(
            stride=stride,
            padding=padding,
            bias=bias,
+            padding_mode=spatial_padding_mode,
        )
    else:
        raise ValueError(f"unsupported dimensions: {dims}")
--- a/comfy/ldm/lightricks/vae/dual_conv3d.py
+++ b/comfy/ldm/lightricks/vae/dual_conv3d.py
@ -18,11 +18,13 @@ class DualConv3d(nn.Module):
        dilation: Union[int, Tuple[int, int, int]] = 1,
        groups=1,
        bias=True,
+        padding_mode="zeros",
    ):
        super(DualConv3d, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
+        self.padding_mode = padding_mode
        # Ensure kernel_size, stride, padding, and dilation are tuples of length 3
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size, kernel_size)
@ -108,6 +110,7 @@ class DualConv3d(nn.Module):
            self.padding1,
            self.dilation1,
            self.groups,
+            padding_mode=self.padding_mode,
        )

        if skip_time_conv:
@ -122,6 +125,7 @@ class DualConv3d(nn.Module):
            self.padding2,
            self.dilation2,
            self.groups,
+            padding_mode=self.padding_mode,
        )

        return x
@ -137,7 +141,16 @@ class DualConv3d(nn.Module):
        stride1 = (self.stride1[1], self.stride1[2])
        padding1 = (self.padding1[1], self.padding1[2])
        dilation1 = (self.dilation1[1], self.dilation1[2])
-        x = F.conv2d(x, weight1, self.bias1, stride1, padding1, dilation1, self.groups)
+        x = F.conv2d(
+            x,
+            weight1,
+            self.bias1,
+            stride1,
+            padding1,
+            dilation1,
+            self.groups,
+            padding_mode=self.padding_mode,
+        )

        _, _, h, w = x.shape

@ -154,7 +167,16 @@ class DualConv3d(nn.Module):
        stride2 = self.stride2[0]
        padding2 = self.padding2[0]
        dilation2 = self.dilation2[0]
-        x = F.conv1d(x, weight2, self.bias2, stride2, padding2, dilation2, self.groups)
+        x = F.conv1d(
+            x,
+            weight2,
+            self.bias2,
+            stride2,
+            padding2,
+            dilation2,
+            self.groups,
+            padding_mode=self.padding_mode,
+        )
        x = rearrange(x, "(b h w) c d -> b c d h w", b=b, h=h, w=w)

        return x
--- a/comfy/ldm/lumina/model.py
+++ b/comfy/ldm/lumina/model.py
@ -0,0 +1,622 @@
+# Code from: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py
+from __future__ import annotations
+
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import comfy.ldm.common_dit
+
+from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, RMSNorm
+from comfy.ldm.modules.attention import optimized_attention_masked
+from comfy.ldm.flux.layers import EmbedND
+
+
+def modulate(x, scale):
+    return x * (1 + scale.unsqueeze(1))
+
+#############################################################################
+#                               Core NextDiT Model                              #
+#############################################################################
+
+
+class JointAttention(nn.Module):
+    """Multi-head attention module."""
+
+    def __init__(
+        self,
+        dim: int,
+        n_heads: int,
+        n_kv_heads: Optional[int],
+        qk_norm: bool,
+        operation_settings={},
+    ):
+        """
+        Initialize the Attention module.
+
+        Args:
+            dim (int): Number of input dimensions.
+            n_heads (int): Number of heads.
+            n_kv_heads (Optional[int]): Number of kv heads, if using GQA.
+
+        """
+        super().__init__()
+        self.n_kv_heads = n_heads if n_kv_heads is None else n_kv_heads
+        self.n_local_heads = n_heads
+        self.n_local_kv_heads = self.n_kv_heads
+        self.n_rep = self.n_local_heads // self.n_local_kv_heads
+        self.head_dim = dim // n_heads
+
+        self.qkv = operation_settings.get("operations").Linear(
+            dim,
+            (n_heads + self.n_kv_heads + self.n_kv_heads) * self.head_dim,
+            bias=False,
+            device=operation_settings.get("device"),
+            dtype=operation_settings.get("dtype"),
+        )
+        self.out = operation_settings.get("operations").Linear(
+            n_heads * self.head_dim,
+            dim,
+            bias=False,
+            device=operation_settings.get("device"),
+            dtype=operation_settings.get("dtype"),
+        )
+
+        if qk_norm:
+            self.q_norm = RMSNorm(self.head_dim, elementwise_affine=True, **operation_settings)
+            self.k_norm = RMSNorm(self.head_dim, elementwise_affine=True, **operation_settings)
+        else:
+            self.q_norm = self.k_norm = nn.Identity()
+
+    @staticmethod
+    def apply_rotary_emb(
+        x_in: torch.Tensor,
+        freqs_cis: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Apply rotary embeddings to input tensors using the given frequency
+        tensor.
+
+        This function applies rotary embeddings to the given query 'xq' and
+        key 'xk' tensors using the provided frequency tensor 'freqs_cis'. The
+        input tensors are reshaped as complex numbers, and the frequency tensor
+        is reshaped for broadcasting compatibility. The resulting tensors
+        contain rotary embeddings and are returned as real tensors.
+
+        Args:
+            x_in (torch.Tensor): Query or Key tensor to apply rotary embeddings.
+            freqs_cis (torch.Tensor): Precomputed frequency tensor for complex
+                exponentials.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor
+                and key tensor with rotary embeddings.
+        """
+
+        t_ = x_in.reshape(*x_in.shape[:-1], -1, 1, 2)
+        t_out = freqs_cis[..., 0] * t_[..., 0] + freqs_cis[..., 1] * t_[..., 1]
+        return t_out.reshape(*x_in.shape)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_mask: torch.Tensor,
+        freqs_cis: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+
+        Args:
+            x:
+            x_mask:
+            freqs_cis:
+
+        Returns:
+
+        """
+        bsz, seqlen, _ = x.shape
+
+        xq, xk, xv = torch.split(
+            self.qkv(x),
+            [
+                self.n_local_heads * self.head_dim,
+                self.n_local_kv_heads * self.head_dim,
+                self.n_local_kv_heads * self.head_dim,
+            ],
+            dim=-1,
+        )
+        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+
+        xq = self.q_norm(xq)
+        xk = self.k_norm(xk)
+
+        xq = JointAttention.apply_rotary_emb(xq, freqs_cis=freqs_cis)
+        xk = JointAttention.apply_rotary_emb(xk, freqs_cis=freqs_cis)
+
+        n_rep = self.n_local_heads // self.n_local_kv_heads
+        if n_rep >= 1:
+            xk = xk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
+            xv = xv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
+        output = optimized_attention_masked(xq.movedim(1, 2), xk.movedim(1, 2), xv.movedim(1, 2), self.n_local_heads, x_mask, skip_reshape=True)
+
+        return self.out(output)
+
+
+class FeedForward(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        multiple_of: int,
+        ffn_dim_multiplier: Optional[float],
+        operation_settings={},
+    ):
+        """
+        Initialize the FeedForward module.
+
+        Args:
+            dim (int): Input dimension.
+            hidden_dim (int): Hidden dimension of the feedforward layer.
+            multiple_of (int): Value to ensure hidden dimension is a multiple
+                of this value.
+            ffn_dim_multiplier (float, optional): Custom multiplier for hidden
+                dimension. Defaults to None.
+
+        """
+        super().__init__()
+        # custom dim factor multiplier
+        if ffn_dim_multiplier is not None:
+            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
+        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+        self.w1 = operation_settings.get("operations").Linear(
+            dim,
+            hidden_dim,
+            bias=False,
+            device=operation_settings.get("device"),
+            dtype=operation_settings.get("dtype"),
+        )
+        self.w2 = operation_settings.get("operations").Linear(
+            hidden_dim,
+            dim,
+            bias=False,
+            device=operation_settings.get("device"),
+            dtype=operation_settings.get("dtype"),
+        )
+        self.w3 = operation_settings.get("operations").Linear(
+            dim,
+            hidden_dim,
+            bias=False,
+            device=operation_settings.get("device"),
+            dtype=operation_settings.get("dtype"),
+        )
+
+    # @torch.compile
+    def _forward_silu_gating(self, x1, x3):
+        return F.silu(x1) * x3
+
+    def forward(self, x):
+        return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
+
+
+class JointTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        dim: int,
+        n_heads: int,
+        n_kv_heads: int,
+        multiple_of: int,
+        ffn_dim_multiplier: float,
+        norm_eps: float,
+        qk_norm: bool,
+        modulation=True,
+        operation_settings={},
+    ) -> None:
+        """
+        Initialize a TransformerBlock.
+
+        Args:
+            layer_id (int): Identifier for the layer.
+            dim (int): Embedding dimension of the input features.
+            n_heads (int): Number of attention heads.
+            n_kv_heads (Optional[int]): Number of attention heads in key and
+                value features (if using GQA), or set to None for the same as
+                query.
+            multiple_of (int):
+            ffn_dim_multiplier (float):
+            norm_eps (float):
+
+        """
+        super().__init__()
+        self.dim = dim
+        self.head_dim = dim // n_heads
+        self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm, operation_settings=operation_settings)
+        self.feed_forward = FeedForward(
+            dim=dim,
+            hidden_dim=4 * dim,
+            multiple_of=multiple_of,
+            ffn_dim_multiplier=ffn_dim_multiplier,
+            operation_settings=operation_settings,
+        )
+        self.layer_id = layer_id
+        self.attention_norm1 = RMSNorm(dim, eps=norm_eps, elementwise_affine=True, **operation_settings)
+        self.ffn_norm1 = RMSNorm(dim, eps=norm_eps, elementwise_affine=True, **operation_settings)
+
+        self.attention_norm2 = RMSNorm(dim, eps=norm_eps, elementwise_affine=True, **operation_settings)
+        self.ffn_norm2 = RMSNorm(dim, eps=norm_eps, elementwise_affine=True, **operation_settings)
+
+        self.modulation = modulation
+        if modulation:
+            self.adaLN_modulation = nn.Sequential(
+                nn.SiLU(),
+                operation_settings.get("operations").Linear(
+                    min(dim, 1024),
+                    4 * dim,
+                    bias=True,
+                    device=operation_settings.get("device"),
+                    dtype=operation_settings.get("dtype"),
+                ),
+            )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_mask: torch.Tensor,
+        freqs_cis: torch.Tensor,
+        adaln_input: Optional[torch.Tensor]=None,
+    ):
+        """
+        Perform a forward pass through the TransformerBlock.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+            freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.
+
+        Returns:
+            torch.Tensor: Output tensor after applying attention and
+                feedforward layers.
+
+        """
+        if self.modulation:
+            assert adaln_input is not None
+            scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(4, dim=1)
+
+            x = x + gate_msa.unsqueeze(1).tanh() * self.attention_norm2(
+                self.attention(
+                    modulate(self.attention_norm1(x), scale_msa),
+                    x_mask,
+                    freqs_cis,
+                )
+            )
+            x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(
+                self.feed_forward(
+                    modulate(self.ffn_norm1(x), scale_mlp),
+                )
+            )
+        else:
+            assert adaln_input is None
+            x = x + self.attention_norm2(
+                self.attention(
+                    self.attention_norm1(x),
+                    x_mask,
+                    freqs_cis,
+                )
+            )
+            x = x + self.ffn_norm2(
+                self.feed_forward(
+                    self.ffn_norm1(x),
+                )
+            )
+        return x
+
+
+class FinalLayer(nn.Module):
+    """
+    The final layer of NextDiT.
+    """
+
+    def __init__(self, hidden_size, patch_size, out_channels, operation_settings={}):
+        super().__init__()
+        self.norm_final = operation_settings.get("operations").LayerNorm(
+            hidden_size,
+            elementwise_affine=False,
+            eps=1e-6,
+            device=operation_settings.get("device"),
+            dtype=operation_settings.get("dtype"),
+        )
+        self.linear = operation_settings.get("operations").Linear(
+            hidden_size,
+            patch_size * patch_size * out_channels,
+            bias=True,
+            device=operation_settings.get("device"),
+            dtype=operation_settings.get("dtype"),
+        )
+
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            operation_settings.get("operations").Linear(
+                min(hidden_size, 1024),
+                hidden_size,
+                bias=True,
+                device=operation_settings.get("device"),
+                dtype=operation_settings.get("dtype"),
+            ),
+        )
+
+    def forward(self, x, c):
+        scale = self.adaLN_modulation(c)
+        x = modulate(self.norm_final(x), scale)
+        x = self.linear(x)
+        return x
+
+
+class NextDiT(nn.Module):
+    """
+    Diffusion model with a Transformer backbone.
+    """
+
+    def __init__(
+        self,
+        patch_size: int = 2,
+        in_channels: int = 4,
+        dim: int = 4096,
+        n_layers: int = 32,
+        n_refiner_layers: int = 2,
+        n_heads: int = 32,
+        n_kv_heads: Optional[int] = None,
+        multiple_of: int = 256,
+        ffn_dim_multiplier: Optional[float] = None,
+        norm_eps: float = 1e-5,
+        qk_norm: bool = False,
+        cap_feat_dim: int = 5120,
+        axes_dims: List[int] = (16, 56, 56),
+        axes_lens: List[int] = (1, 512, 512),
+        image_model=None,
+        device=None,
+        dtype=None,
+        operations=None,
+    ) -> None:
+        super().__init__()
+        self.dtype = dtype
+        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
+        self.in_channels = in_channels
+        self.out_channels = in_channels
+        self.patch_size = patch_size
+
+        self.x_embedder = operation_settings.get("operations").Linear(
+            in_features=patch_size * patch_size * in_channels,
+            out_features=dim,
+            bias=True,
+            device=operation_settings.get("device"),
+            dtype=operation_settings.get("dtype"),
+        )
+
+        self.noise_refiner = nn.ModuleList(
+            [
+                JointTransformerBlock(
+                    layer_id,
+                    dim,
+                    n_heads,
+                    n_kv_heads,
+                    multiple_of,
+                    ffn_dim_multiplier,
+                    norm_eps,
+                    qk_norm,
+                    modulation=True,
+                    operation_settings=operation_settings,
+                )
+                for layer_id in range(n_refiner_layers)
+            ]
+        )
+        self.context_refiner = nn.ModuleList(
+            [
+                JointTransformerBlock(
+                    layer_id,
+                    dim,
+                    n_heads,
+                    n_kv_heads,
+                    multiple_of,
+                    ffn_dim_multiplier,
+                    norm_eps,
+                    qk_norm,
+                    modulation=False,
+                    operation_settings=operation_settings,
+                )
+                for layer_id in range(n_refiner_layers)
+            ]
+        )
+
+        self.t_embedder = TimestepEmbedder(min(dim, 1024), **operation_settings)
+        self.cap_embedder = nn.Sequential(
+            RMSNorm(cap_feat_dim, eps=norm_eps, elementwise_affine=True, **operation_settings),
+            operation_settings.get("operations").Linear(
+                cap_feat_dim,
+                dim,
+                bias=True,
+                device=operation_settings.get("device"),
+                dtype=operation_settings.get("dtype"),
+            ),
+        )
+
+        self.layers = nn.ModuleList(
+            [
+                JointTransformerBlock(
+                    layer_id,
+                    dim,
+                    n_heads,
+                    n_kv_heads,
+                    multiple_of,
+                    ffn_dim_multiplier,
+                    norm_eps,
+                    qk_norm,
+                    operation_settings=operation_settings,
+                )
+                for layer_id in range(n_layers)
+            ]
+        )
+        self.norm_final = RMSNorm(dim, eps=norm_eps, elementwise_affine=True, **operation_settings)
+        self.final_layer = FinalLayer(dim, patch_size, self.out_channels, operation_settings=operation_settings)
+
+        assert (dim // n_heads) == sum(axes_dims)
+        self.axes_dims = axes_dims
+        self.axes_lens = axes_lens
+        self.rope_embedder = EmbedND(dim=dim // n_heads, theta=10000.0, axes_dim=axes_dims)
+        self.dim = dim
+        self.n_heads = n_heads
+
+    def unpatchify(
+        self, x: torch.Tensor, img_size: List[Tuple[int, int]], cap_size: List[int], return_tensor=False
+    ) -> List[torch.Tensor]:
+        """
+        x: (N, T, patch_size**2 * C)
+        imgs: (N, H, W, C)
+        """
+        pH = pW = self.patch_size
+        imgs = []
+        for i in range(x.size(0)):
+            H, W = img_size[i]
+            begin = cap_size[i]
+            end = begin + (H // pH) * (W // pW)
+            imgs.append(
+                x[i][begin:end]
+                .view(H // pH, W // pW, pH, pW, self.out_channels)
+                .permute(4, 0, 2, 1, 3)
+                .flatten(3, 4)
+                .flatten(1, 2)
+            )
+
+        if return_tensor:
+            imgs = torch.stack(imgs, dim=0)
+        return imgs
+
+    def patchify_and_embed(
+        self, x: List[torch.Tensor] | torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens
+    ) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], List[int], torch.Tensor]:
+        bsz = len(x)
+        pH = pW = self.patch_size
+        device = x[0].device
+        dtype = x[0].dtype
+
+        if cap_mask is not None:
+            l_effective_cap_len = cap_mask.sum(dim=1).tolist()
+        else:
+            l_effective_cap_len = [num_tokens] * bsz
+
+        if cap_mask is not None and not torch.is_floating_point(cap_mask):
+            cap_mask = (cap_mask - 1).to(dtype) * torch.finfo(dtype).max
+
+        img_sizes = [(img.size(1), img.size(2)) for img in x]
+        l_effective_img_len = [(H // pH) * (W // pW) for (H, W) in img_sizes]
+
+        max_seq_len = max(
+            (cap_len+img_len for cap_len, img_len in zip(l_effective_cap_len, l_effective_img_len))
+        )
+        max_cap_len = max(l_effective_cap_len)
+        max_img_len = max(l_effective_img_len)
+
+        position_ids = torch.zeros(bsz, max_seq_len, 3, dtype=torch.int32, device=device)
+
+        for i in range(bsz):
+            cap_len = l_effective_cap_len[i]
+            img_len = l_effective_img_len[i]
+            H, W = img_sizes[i]
+            H_tokens, W_tokens = H // pH, W // pW
+            assert H_tokens * W_tokens == img_len
+
+            position_ids[i, :cap_len, 0] = torch.arange(cap_len, dtype=torch.int32, device=device)
+            position_ids[i, cap_len:cap_len+img_len, 0] = cap_len
+            row_ids = torch.arange(H_tokens, dtype=torch.int32, device=device).view(-1, 1).repeat(1, W_tokens).flatten()
+            col_ids = torch.arange(W_tokens, dtype=torch.int32, device=device).view(1, -1).repeat(H_tokens, 1).flatten()
+            position_ids[i, cap_len:cap_len+img_len, 1] = row_ids
+            position_ids[i, cap_len:cap_len+img_len, 2] = col_ids
+
+        freqs_cis = self.rope_embedder(position_ids).movedim(1, 2).to(dtype)
+
+        # build freqs_cis for cap and image individually
+        cap_freqs_cis_shape = list(freqs_cis.shape)
+        # cap_freqs_cis_shape[1] = max_cap_len
+        cap_freqs_cis_shape[1] = cap_feats.shape[1]
+        cap_freqs_cis = torch.zeros(*cap_freqs_cis_shape, device=device, dtype=freqs_cis.dtype)
+
+        img_freqs_cis_shape = list(freqs_cis.shape)
+        img_freqs_cis_shape[1] = max_img_len
+        img_freqs_cis = torch.zeros(*img_freqs_cis_shape, device=device, dtype=freqs_cis.dtype)
+
+        for i in range(bsz):
+            cap_len = l_effective_cap_len[i]
+            img_len = l_effective_img_len[i]
+            cap_freqs_cis[i, :cap_len] = freqs_cis[i, :cap_len]
+            img_freqs_cis[i, :img_len] = freqs_cis[i, cap_len:cap_len+img_len]
+
+        # refine context
+        for layer in self.context_refiner:
+            cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis)
+
+        # refine image
+        flat_x = []
+        for i in range(bsz):
+            img = x[i]
+            C, H, W = img.size()
+            img = img.view(C, H // pH, pH, W // pW, pW).permute(1, 3, 2, 4, 0).flatten(2).flatten(0, 1)
+            flat_x.append(img)
+        x = flat_x
+        padded_img_embed = torch.zeros(bsz, max_img_len, x[0].shape[-1], device=device, dtype=x[0].dtype)
+        padded_img_mask = torch.zeros(bsz, max_img_len, dtype=dtype, device=device)
+        for i in range(bsz):
+            padded_img_embed[i, :l_effective_img_len[i]] = x[i]
+            padded_img_mask[i, l_effective_img_len[i]:] = -torch.finfo(dtype).max
+
+        padded_img_embed = self.x_embedder(padded_img_embed)
+        padded_img_mask = padded_img_mask.unsqueeze(1)
+        for layer in self.noise_refiner:
+            padded_img_embed = layer(padded_img_embed, padded_img_mask, img_freqs_cis, t)
+
+        if cap_mask is not None:
+            mask = torch.zeros(bsz, max_seq_len, dtype=dtype, device=device)
+            mask[:, :max_cap_len] = cap_mask[:, :max_cap_len]
+        else:
+            mask = None
+
+        padded_full_embed = torch.zeros(bsz, max_seq_len, self.dim, device=device, dtype=x[0].dtype)
+        for i in range(bsz):
+            cap_len = l_effective_cap_len[i]
+            img_len = l_effective_img_len[i]
+
+            padded_full_embed[i, :cap_len] = cap_feats[i, :cap_len]
+            padded_full_embed[i, cap_len:cap_len+img_len] = padded_img_embed[i, :img_len]
+
+        return padded_full_embed, mask, img_sizes, l_effective_cap_len, freqs_cis
+
+    # def forward(self, x, t, cap_feats, cap_mask):
+    def forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs):
+        t = 1.0 - timesteps
+        cap_feats = context
+        cap_mask = attention_mask
+        bs, c, h, w = x.shape
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
+        """
+        Forward pass of NextDiT.
+        t: (N,) tensor of diffusion timesteps
+        y: (N,) tensor of text tokens/features
+        """
+
+        t = self.t_embedder(t, dtype=x.dtype)  # (N, D)
+        adaln_input = t
+
+        cap_feats = self.cap_embedder(cap_feats)  # (N, L, D)  # todo check if able to batchify w.o. redundant compute
+
+        x_is_tensor = isinstance(x, torch.Tensor)
+        x, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens)
+        freqs_cis = freqs_cis.to(x.device)
+
+        for layer in self.layers:
+            x = layer(x, mask, freqs_cis, adaln_input)
+
+        x = self.final_layer(x, adaln_input)
+        x = self.unpatchify(x, img_size, cap_size, return_tensor=x_is_tensor)[:,:,:h,:w]
+
+        return -x
+
--- a/comfy/ldm/models/autoencoder.py
+++ b/comfy/ldm/models/autoencoder.py
@ -1,10 +1,12 @@
+import logging
+import math
 import torch
 from contextlib import contextmanager
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, Tuple, Union

 from comfy.ldm.modules.distributions.distributions import DiagonalGaussianDistribution

-from comfy.ldm.util import instantiate_from_config
+from comfy.ldm.util import get_obj_from_str, instantiate_from_config
 from comfy.ldm.modules.ema import LitEma
 import comfy.ops

@ -52,7 +54,7 @@ class AbstractAutoencoder(torch.nn.Module):

        if self.use_ema:
            self.model_ema = LitEma(self, decay=ema_decay)
-            logpy.info(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
+            logging.info(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")

    def get_input(self, batch) -> Any:
        raise NotImplementedError()
@ -68,14 +70,14 @@ class AbstractAutoencoder(torch.nn.Module):
            self.model_ema.store(self.parameters())
            self.model_ema.copy_to(self)
            if context is not None:
-                logpy.info(f"{context}: Switched to EMA weights")
+                logging.info(f"{context}: Switched to EMA weights")
        try:
            yield None
        finally:
            if self.use_ema:
                self.model_ema.restore(self.parameters())
                if context is not None:
-                    logpy.info(f"{context}: Restored training weights")
+                    logging.info(f"{context}: Restored training weights")

    def encode(self, *args, **kwargs) -> torch.Tensor:
        raise NotImplementedError("encode()-method of abstract base class called")
@ -84,7 +86,7 @@ class AbstractAutoencoder(torch.nn.Module):
        raise NotImplementedError("decode()-method of abstract base class called")

    def instantiate_optimizer_from_config(self, params, lr, cfg):
-        logpy.info(f"loading >>> {cfg['target']} <<< optimizer from config")
+        logging.info(f"loading >>> {cfg['target']} <<< optimizer from config")
        return get_obj_from_str(cfg["target"])(
            params, lr=lr, **cfg.get("params", dict())
        )
@ -112,7 +114,7 @@ class AutoencodingEngine(AbstractAutoencoder):

        self.encoder: torch.nn.Module = instantiate_from_config(encoder_config)
        self.decoder: torch.nn.Module = instantiate_from_config(decoder_config)
-        self.regularization: AbstractRegularizer = instantiate_from_config(
+        self.regularization = instantiate_from_config(
            regularizer_config
        )

@ -160,12 +162,19 @@ class AutoencodingEngineLegacy(AutoencodingEngine):
            },
            **kwargs,
        )
-        self.quant_conv = comfy.ops.disable_weight_init.Conv2d(
+
+        if ddconfig.get("conv3d", False):
+            conv_op = comfy.ops.disable_weight_init.Conv3d
+        else:
+            conv_op = comfy.ops.disable_weight_init.Conv2d
+
+        self.quant_conv = conv_op(
            (1 + ddconfig["double_z"]) * ddconfig["z_channels"],
            (1 + ddconfig["double_z"]) * embed_dim,
            1,
        )
-        self.post_quant_conv = comfy.ops.disable_weight_init.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+
+        self.post_quant_conv = conv_op(embed_dim, ddconfig["z_channels"], 1)
        self.embed_dim = embed_dim

    def get_autoencoder_params(self) -> list:
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@ -1,4 +1,6 @@
 import math
+import sys
+
 import torch
 import torch.nn.functional as F
 from torch import nn, einsum
@ -15,44 +17,44 @@ if model_management.xformers_enabled():
    import xformers
    import xformers.ops

+if model_management.sage_attention_enabled():
+    try:
+        from sageattention import sageattn
+    except ModuleNotFoundError:
+        logging.error(f"\n\nTo use the `--use-sage-attention` feature, the `sageattention` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install sageattention")
+        exit(-1)
+
+if model_management.flash_attention_enabled():
+    try:
+        from flash_attn import flash_attn_func
+    except ModuleNotFoundError:
+        logging.error(f"\n\nTo use the `--use-flash-attention` feature, the `flash-attn` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install flash-attn")
+        exit(-1)
+
 from comfy.cli_args import args
 import comfy.ops
 ops = comfy.ops.disable_weight_init

 FORCE_UPCAST_ATTENTION_DTYPE = model_management.force_upcast_attention_dtype()

-def get_attn_precision(attn_precision):
+def get_attn_precision(attn_precision, current_dtype):
    if args.dont_upcast_attention:
        return None
-    if FORCE_UPCAST_ATTENTION_DTYPE is not None:
-        return FORCE_UPCAST_ATTENTION_DTYPE
+
+    if FORCE_UPCAST_ATTENTION_DTYPE is not None and current_dtype in FORCE_UPCAST_ATTENTION_DTYPE:
+        return FORCE_UPCAST_ATTENTION_DTYPE[current_dtype]
    return attn_precision

 def exists(val):
    return val is not None


-def uniq(arr):
-    return{el: True for el in arr}.keys()
-
-
 def default(val, d):
    if exists(val):
        return val
    return d


-def max_neg_value(t):
-    return -torch.finfo(t.dtype).max
-
-
-def init_(tensor):
-    dim = tensor.shape[-1]
-    std = 1 / math.sqrt(dim)
-    tensor.uniform_(-std, std)
-    return tensor
-
-
 # feedforward
 class GEGLU(nn.Module):
    def __init__(self, dim_in, dim_out, dtype=None, device=None, operations=ops):
@ -86,8 +88,8 @@ class FeedForward(nn.Module):
 def Normalize(in_channels, dtype=None, device=None):
    return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True, dtype=dtype, device=device)

-def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False):
-    attn_precision = get_attn_precision(attn_precision)
+def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
+    attn_precision = get_attn_precision(attn_precision, q.dtype)

    if skip_reshape:
        b, _, _, dim_head = q.shape
@ -139,6 +141,13 @@ def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
    sim = sim.softmax(dim=-1)

    out = einsum('b i j, b j d -> b i d', sim.to(v.dtype), v)
+
+    if skip_output_reshape:
+        out = (
+            out.unsqueeze(0)
+            .reshape(b, heads, -1, dim_head)
+        )
+    else:
        out = (
            out.unsqueeze(0)
            .reshape(b, heads, -1, dim_head)
@ -148,8 +157,8 @@ def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
    return out


-def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None, skip_reshape=False):
-    attn_precision = get_attn_precision(attn_precision)
+def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
+    attn_precision = get_attn_precision(attn_precision, query.dtype)

    if skip_reshape:
        b, _, _, dim_head = query.shape
@ -157,8 +166,6 @@ def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None,
        b, _, dim_head = query.shape
        dim_head //= heads

-    scale = dim_head ** -0.5
-
    if skip_reshape:
        query = query.reshape(b * heads, -1, dim_head)
        value = value.reshape(b * heads, -1, dim_head)
@ -177,9 +184,8 @@ def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None,
        bytes_per_token = torch.finfo(query.dtype).bits//8
    batch_x_heads, q_tokens, _ = query.shape
    _, _, k_tokens = key.shape
-    qk_matmul_size_bytes = batch_x_heads * bytes_per_token * q_tokens * k_tokens

-    mem_free_total, mem_free_torch = model_management.get_free_memory(query.device, True)
+    mem_free_total, _ = model_management.get_free_memory(query.device, True)

    kv_chunk_size_min = None
    kv_chunk_size = None
@ -215,12 +221,14 @@ def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None,
    )

    hidden_states = hidden_states.to(dtype)
-
+    if skip_output_reshape:
+        hidden_states = hidden_states.unflatten(0, (-1, heads))
+    else:
        hidden_states = hidden_states.unflatten(0, (-1, heads)).transpose(1,2).flatten(start_dim=2)
    return hidden_states

-def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False):
-    attn_precision = get_attn_precision(attn_precision)
+def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
+    attn_precision = get_attn_precision(attn_precision, q.dtype)

    if skip_reshape:
        b, _, _, dim_head = q.shape
@ -230,7 +238,6 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape

    scale = dim_head ** -0.5

-    h = heads
    if skip_reshape:
         q, k, v = map(
            lambda t: t.reshape(b * heads, -1, dim_head),
@ -327,6 +334,12 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape

    del q, k, v

+    if skip_output_reshape:
+        r1 = (
+            r1.unsqueeze(0)
+            .reshape(b, heads, -1, dim_head)
+        )
+    else:
        r1 = (
            r1.unsqueeze(0)
            .reshape(b, heads, -1, dim_head)
@ -343,13 +356,10 @@ try:
 except:
    pass

-def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False):
-    if skip_reshape:
-        b, _, _, dim_head = q.shape
-    else:
-        b, _, dim_head = q.shape
-        dim_head //= heads
-
+def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
+    b = q.shape[0]
+    dim_head = q.shape[-1]
+    # check to make sure xformers isn't broken
    disabled_xformers = False

    if BROKEN_XFORMERS:
@ -364,31 +374,43 @@ def attention_xformers(q, k, v, heads, mask=None, attn_precision=None, skip_resh
        return attention_pytorch(q, k, v, heads, mask, skip_reshape=skip_reshape)

    if skip_reshape:
+        # b h k d -> b k h d
        q, k, v = map(
-            lambda t: t.reshape(b * heads, -1, dim_head),
+            lambda t: t.permute(0, 2, 1, 3),
            (q, k, v),
        )
+    # actually do the reshaping
    else:
+        dim_head //= heads
        q, k, v = map(
            lambda t: t.reshape(b, -1, heads, dim_head),
            (q, k, v),
        )

    if mask is not None:
+        # add a singleton batch dimension
+        if mask.ndim == 2:
+            mask = mask.unsqueeze(0)
+        # add a singleton heads dimension
+        if mask.ndim == 3:
+            mask = mask.unsqueeze(1)
+        # pad to a multiple of 8
        pad = 8 - mask.shape[-1] % 8
-        mask_out = torch.empty([q.shape[0], q.shape[2], q.shape[1], mask.shape[-1] + pad], dtype=q.dtype, device=q.device)
+        # the xformers docs says that it's allowed to have a mask of shape (1, Nq, Nk)
+        # but when using separated heads, the shape has to be (B, H, Nq, Nk)
+        # in flux, this matrix ends up being over 1GB
+        # here, we create a mask with the same batch/head size as the input mask (potentially singleton or full)
+        mask_out = torch.empty([mask.shape[0], mask.shape[1], q.shape[1], mask.shape[-1] + pad], dtype=q.dtype, device=q.device)
+
        mask_out[..., :mask.shape[-1]] = mask
+        # doesn't this remove the padding again??
        mask = mask_out[..., :mask.shape[-1]]
+        mask = mask.expand(b, heads, -1, -1)

    out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=mask)

-    if skip_reshape:
-        out = (
-            out.unsqueeze(0)
-            .reshape(b, heads, -1, dim_head)
-            .permute(0, 2, 1, 3)
-            .reshape(b, -1, heads * dim_head)
-        )
+    if skip_output_reshape:
+        out = out.permute(0, 2, 1, 3)
    else:
        out = (
            out.reshape(b, -1, heads * dim_head)
@ -403,7 +425,7 @@ else:
    SDP_BATCH_LIMIT = 2**31


-def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False):
+def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
    if skip_reshape:
        b, _, _, dim_head = q.shape
    else:
@ -414,32 +436,160 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
            (q, k, v),
        )

-    if SDP_BATCH_LIMIT >= q.shape[0]:
+    if mask is not None:
+        # add a batch dimension if there isn't already one
+        if mask.ndim == 2:
+            mask = mask.unsqueeze(0)
+        # add a heads dimension if there isn't already one
+        if mask.ndim == 3:
+            mask = mask.unsqueeze(1)
+
+    if SDP_BATCH_LIMIT >= b:
        out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
+        if not skip_output_reshape:
            out = (
                out.transpose(1, 2).reshape(b, -1, heads * dim_head)
            )
    else:
-        out = torch.empty((q.shape[0], q.shape[2], heads * dim_head), dtype=q.dtype, layout=q.layout, device=q.device)
-        for i in range(0, q.shape[0], SDP_BATCH_LIMIT):
-            out[i : i + SDP_BATCH_LIMIT] = torch.nn.functional.scaled_dot_product_attention(q[i : i + SDP_BATCH_LIMIT], k[i : i + SDP_BATCH_LIMIT], v[i : i + SDP_BATCH_LIMIT], attn_mask=mask, dropout_p=0.0, is_causal=False).transpose(1, 2).reshape(-1, q.shape[2], heads * dim_head)
+        out = torch.empty((b, q.shape[2], heads * dim_head), dtype=q.dtype, layout=q.layout, device=q.device)
+        for i in range(0, b, SDP_BATCH_LIMIT):
+            m = mask
+            if mask is not None:
+                if mask.shape[0] > 1:
+                    m = mask[i : i + SDP_BATCH_LIMIT]
+
+            out[i : i + SDP_BATCH_LIMIT] = torch.nn.functional.scaled_dot_product_attention(
+                q[i : i + SDP_BATCH_LIMIT],
+                k[i : i + SDP_BATCH_LIMIT],
+                v[i : i + SDP_BATCH_LIMIT],
+                attn_mask=m,
+                dropout_p=0.0, is_causal=False
+            ).transpose(1, 2).reshape(-1, q.shape[2], heads * dim_head)
+    return out
+
+
+def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
+    if skip_reshape:
+        b, _, _, dim_head = q.shape
+        tensor_layout = "HND"
+    else:
+        b, _, dim_head = q.shape
+        dim_head //= heads
+        q, k, v = map(
+            lambda t: t.view(b, -1, heads, dim_head),
+            (q, k, v),
+        )
+        tensor_layout = "NHD"
+
+    if mask is not None:
+        # add a batch dimension if there isn't already one
+        if mask.ndim == 2:
+            mask = mask.unsqueeze(0)
+        # add a heads dimension if there isn't already one
+        if mask.ndim == 3:
+            mask = mask.unsqueeze(1)
+
+    try:
+        out = sageattn(q, k, v, attn_mask=mask, is_causal=False, tensor_layout=tensor_layout)
+    except Exception as e:
+        logging.error("Error running sage attention: {}, using pytorch attention instead.".format(e))
+        if tensor_layout == "NHD":
+            q, k, v = map(
+                lambda t: t.transpose(1, 2),
+                (q, k, v),
+            )
+        return attention_pytorch(q, k, v, heads, mask=mask, skip_reshape=True, skip_output_reshape=skip_output_reshape)
+
+    if tensor_layout == "HND":
+        if not skip_output_reshape:
+            out = (
+                out.transpose(1, 2).reshape(b, -1, heads * dim_head)
+            )
+    else:
+        if skip_output_reshape:
+            out = out.transpose(1, 2)
+        else:
+            out = out.reshape(b, -1, heads * dim_head)
+    return out
+
+
+try:
+    @torch.library.custom_op("flash_attention::flash_attn", mutates_args=())
+    def flash_attn_wrapper(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                    dropout_p: float = 0.0, causal: bool = False) -> torch.Tensor:
+        return flash_attn_func(q, k, v, dropout_p=dropout_p, causal=causal)
+
+
+    @flash_attn_wrapper.register_fake
+    def flash_attn_fake(q, k, v, dropout_p=0.0, causal=False):
+        # Output shape is the same as q
+        return q.new_empty(q.shape)
+except AttributeError as error:
+    FLASH_ATTN_ERROR = error
+
+    def flash_attn_wrapper(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                    dropout_p: float = 0.0, causal: bool = False) -> torch.Tensor:
+        assert False, f"Could not define flash_attn_wrapper: {FLASH_ATTN_ERROR}"
+
+
+def attention_flash(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
+    if skip_reshape:
+        b, _, _, dim_head = q.shape
+    else:
+        b, _, dim_head = q.shape
+        dim_head //= heads
+        q, k, v = map(
+            lambda t: t.view(b, -1, heads, dim_head).transpose(1, 2),
+            (q, k, v),
+        )
+
+    if mask is not None:
+        # add a batch dimension if there isn't already one
+        if mask.ndim == 2:
+            mask = mask.unsqueeze(0)
+        # add a heads dimension if there isn't already one
+        if mask.ndim == 3:
+            mask = mask.unsqueeze(1)
+
+    try:
+        assert mask is None
+        out = flash_attn_wrapper(
+            q.transpose(1, 2),
+            k.transpose(1, 2),
+            v.transpose(1, 2),
+            dropout_p=0.0,
+            causal=False,
+        ).transpose(1, 2)
+    except Exception as e:
+        logging.warning(f"Flash Attention failed, using default SDPA: {e}")
+        out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
+    if not skip_output_reshape:
+        out = (
+            out.transpose(1, 2).reshape(b, -1, heads * dim_head)
+        )
    return out


 optimized_attention = attention_basic

-if model_management.xformers_enabled():
-    logging.info("Using xformers cross attention")
+if model_management.sage_attention_enabled():
+    logging.info("Using sage attention")
+    optimized_attention = attention_sage
+elif model_management.xformers_enabled():
+    logging.info("Using xformers attention")
    optimized_attention = attention_xformers
+elif model_management.flash_attention_enabled():
+    logging.info("Using Flash Attention")
+    optimized_attention = attention_flash
 elif model_management.pytorch_attention_enabled():
-    logging.info("Using pytorch cross attention")
+    logging.info("Using pytorch attention")
    optimized_attention = attention_pytorch
 else:
    if args.use_split_cross_attention:
-        logging.info("Using split optimization for cross attention")
+        logging.info("Using split optimization for attention")
        optimized_attention = attention_split
    else:
-        logging.info("Using sub quadratic optimization for cross attention, if you have memory or speed issues try using: --use-split-cross-attention")
+        logging.info("Using sub quadratic optimization for attention, if you have memory or speed issues try using: --use-split-cross-attention")
        optimized_attention = attention_sub_quad

 optimized_attention_masked = optimized_attention
@ -697,6 +847,7 @@ class SpatialTransformer(nn.Module):
        if not isinstance(context, list):
            context = [context] * len(self.transformer_blocks)
        b, c, h, w = x.shape
+        transformer_options["activations_shape"] = list(x.shape)
        x_in = x
        x = self.norm(x)
        if not self.use_linear:
@ -812,6 +963,7 @@ class SpatialVideoTransformer(SpatialTransformer):
        transformer_options={}
    ) -> torch.Tensor:
        _, _, h, w = x.shape
+        transformer_options["activations_shape"] = list(x.shape)
        x_in = x
        spatial_context = None
        if exists(context):
--- a/comfy/ldm/modules/diffusionmodules/mmdit.py
+++ b/comfy/ldm/modules/diffusionmodules/mmdit.py
@ -1,5 +1,4 @@
-import logging
-import math
+from functools import partial
 from typing import Dict, Optional, List

 import numpy as np
@ -72,45 +71,33 @@ class PatchEmbed(nn.Module):
            strict_img_size: bool = True,
            dynamic_img_pad: bool = True,
            padding_mode='circular',
+            conv3d=False,
            dtype=None,
            device=None,
            operations=None,
    ):
        super().__init__()
+        try:
+            len(patch_size)
+            self.patch_size = patch_size
+        except:
+            if conv3d:
+                self.patch_size = (patch_size, patch_size, patch_size)
+            else:
                self.patch_size = (patch_size, patch_size)
        self.padding_mode = padding_mode
-        if img_size is not None:
-            self.img_size = (img_size, img_size)
-            self.grid_size = tuple([s // p for s, p in zip(self.img_size, self.patch_size)])
-            self.num_patches = self.grid_size[0] * self.grid_size[1]
-        else:
-            self.img_size = None
-            self.grid_size = None
-            self.num_patches = None

        # flatten spatial dim and transpose to channels last, kept for bwd compat
        self.flatten = flatten
        self.strict_img_size = strict_img_size
        self.dynamic_img_pad = dynamic_img_pad
-
+        if conv3d:
+            self.proj = operations.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias, dtype=dtype, device=device)
+        else:
            self.proj = operations.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias, dtype=dtype, device=device)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
-        # B, C, H, W = x.shape
-        # if self.img_size is not None:
-        #     if self.strict_img_size:
-        #         _assert(H == self.img_size[0], f"Input height ({H}) doesn't match model ({self.img_size[0]}).")
-        #         _assert(W == self.img_size[1], f"Input width ({W}) doesn't match model ({self.img_size[1]}).")
-        #     elif not self.dynamic_img_pad:
-        #         _assert(
-        #             H % self.patch_size[0] == 0,
-        #             f"Input height ({H}) should be divisible by patch size ({self.patch_size[0]})."
-        #         )
-        #         _assert(
-        #             W % self.patch_size[1] == 0,
-        #             f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]})."
-        #         )
        if self.dynamic_img_pad:
            x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size, padding_mode=self.padding_mode)
        x = self.proj(x)
@ -334,7 +321,7 @@ class SelfAttention(nn.Module):

 class RMSNorm(torch.nn.Module):
    def __init__(
-        self, dim: int, elementwise_affine: bool = False, eps: float = 1e-6, device=None, dtype=None
+        self, dim: int, elementwise_affine: bool = False, eps: float = 1e-6, device=None, dtype=None, **kwargs
    ):
        """
        Initialize the RMSNorm normalization layer.
--- a/comfy/ldm/modules/diffusionmodules/model.py
+++ b/comfy/ldm/modules/diffusionmodules/model.py
@ -3,7 +3,6 @@ import math
 import torch
 import torch.nn as nn
 import numpy as np
-from typing import Optional, Any
 import logging

 from comfy import model_management
@ -44,51 +43,100 @@ def Normalize(in_channels, num_groups=32):
    return ops.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)


+class VideoConv3d(nn.Module):
+    def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding_mode='replicate', padding=1, **kwargs):
+        super().__init__()
+
+        self.padding_mode = padding_mode
+        if padding != 0:
+            padding = (padding, padding, padding, padding, kernel_size - 1, 0)
+        else:
+            kwargs["padding"] = padding
+
+        self.padding = padding
+        self.conv = ops.Conv3d(n_channels, out_channels, kernel_size, stride=stride, dilation=dilation, **kwargs)
+
+    def forward(self, x):
+        if self.padding != 0:
+            x = torch.nn.functional.pad(x, self.padding, mode=self.padding_mode)
+        return self.conv(x)
+
+def interpolate_up(x, scale_factor):
+    try:
+        return torch.nn.functional.interpolate(x, scale_factor=scale_factor, mode="nearest")
+    except: #operation not implemented for bf16
+        orig_shape = list(x.shape)
+        out_shape = orig_shape[:2]
+        for i in range(len(orig_shape) - 2):
+            out_shape.append(round(orig_shape[i + 2] * scale_factor[i]))
+        out = torch.empty(out_shape, dtype=x.dtype, layout=x.layout, device=x.device)
+        split = 8
+        l = out.shape[1] // split
+        for i in range(0, out.shape[1], l):
+            out[:,i:i+l] = torch.nn.functional.interpolate(x[:,i:i+l].to(torch.float32), scale_factor=scale_factor, mode="nearest").to(x.dtype)
+        return out
+
 class Upsample(nn.Module):
-    def __init__(self, in_channels, with_conv):
+    def __init__(self, in_channels, with_conv, conv_op=ops.Conv2d, scale_factor=2.0):
        super().__init__()
        self.with_conv = with_conv
+        self.scale_factor = scale_factor
+
        if self.with_conv:
-            self.conv = ops.Conv2d(in_channels,
+            self.conv = conv_op(in_channels,
                                        in_channels,
                                        kernel_size=3,
                                        stride=1,
                                        padding=1)

    def forward(self, x):
-        try:
-            x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
-        except: #operation not implemented for bf16
-            b, c, h, w = x.shape
-            out = torch.empty((b, c, h*2, w*2), dtype=x.dtype, layout=x.layout, device=x.device)
-            split = 8
-            l = out.shape[1] // split
-            for i in range(0, out.shape[1], l):
-                out[:,i:i+l] = torch.nn.functional.interpolate(x[:,i:i+l].to(torch.float32), scale_factor=2.0, mode="nearest").to(x.dtype)
-            del x
-            x = out
+        scale_factor = self.scale_factor
+        if isinstance(scale_factor, (int, float)):
+            scale_factor = (scale_factor,) * (x.ndim - 2)

+        if x.ndim == 5 and scale_factor[0] > 1.0:
+            t = x.shape[2]
+            if t > 1:
+                a, b = x.split((1, t - 1), dim=2)
+                del x
+                b = interpolate_up(b, scale_factor)
+            else:
+                a = x
+
+            a = interpolate_up(a.squeeze(2), scale_factor=scale_factor[1:]).unsqueeze(2)
+            if t > 1:
+                x = torch.cat((a, b), dim=2)
+            else:
+                x = a
+        else:
+            x = interpolate_up(x, scale_factor)
        if self.with_conv:
            x = self.conv(x)
        return x


 class Downsample(nn.Module):
-    def __init__(self, in_channels, with_conv):
+    def __init__(self, in_channels, with_conv, stride=2, conv_op=ops.Conv2d):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            # no asymmetric padding in torch conv, must do it ourselves
-            self.conv = ops.Conv2d(in_channels,
+            self.conv = conv_op(in_channels,
                                        in_channels,
                                        kernel_size=3,
-                                        stride=2,
+                                        stride=stride,
                                        padding=0)

    def forward(self, x):
        if self.with_conv:
-            pad = (0,1,0,1)
-            x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+            if x.ndim == 4:
+                pad = (0, 1, 0, 1)
+                mode = "constant"
+                x = torch.nn.functional.pad(x, pad, mode=mode, value=0)
+            elif x.ndim == 5:
+                pad = (1, 1, 1, 1, 2, 0)
+                mode = "replicate"
+                x = torch.nn.functional.pad(x, pad, mode=mode)
            x = self.conv(x)
        else:
            x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
@ -97,7 +145,7 @@ class Downsample(nn.Module):

 class ResnetBlock(nn.Module):
    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
-                 dropout, temb_channels=512):
+                 dropout, temb_channels=512, conv_op=ops.Conv2d):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
@ -106,7 +154,7 @@ class ResnetBlock(nn.Module):

        self.swish = torch.nn.SiLU(inplace=True)
        self.norm1 = Normalize(in_channels)
-        self.conv1 = ops.Conv2d(in_channels,
+        self.conv1 = conv_op(in_channels,
                                     out_channels,
                                     kernel_size=3,
                                     stride=1,
@ -116,20 +164,20 @@ class ResnetBlock(nn.Module):
                                             out_channels)
        self.norm2 = Normalize(out_channels)
        self.dropout = torch.nn.Dropout(dropout, inplace=True)
-        self.conv2 = ops.Conv2d(out_channels,
+        self.conv2 = conv_op(out_channels,
                                     out_channels,
                                     kernel_size=3,
                                     stride=1,
                                     padding=1)
        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
-                self.conv_shortcut = ops.Conv2d(in_channels,
+                self.conv_shortcut = conv_op(in_channels,
                                                     out_channels,
                                                     kernel_size=3,
                                                     stride=1,
                                                     padding=1)
            else:
-                self.nin_shortcut = ops.Conv2d(in_channels,
+                self.nin_shortcut = conv_op(in_channels,
                                                    out_channels,
                                                    kernel_size=1,
                                                    stride=1,
@ -163,7 +211,6 @@ def slice_attention(q, k, v):

    mem_free_total = model_management.get_free_memory(q.device)

-    gb = 1024 ** 3
    tensor_size = q.shape[0] * q.shape[1] * k.shape[2] * q.element_size()
    modifier = 3 if q.element_size() == 2 else 2.5
    mem_required = tensor_size * modifier
@ -196,21 +243,25 @@ def slice_attention(q, k, v):

 def normal_attention(q, k, v):
    # compute attention
-    b,c,h,w = q.shape
+    orig_shape = q.shape
+    b = orig_shape[0]
+    c = orig_shape[1]

-    q = q.reshape(b,c,h*w)
-    q = q.permute(0,2,1)   # b,hw,c
-    k = k.reshape(b,c,h*w) # b,c,hw
-    v = v.reshape(b,c,h*w)
+    q = q.reshape(b, c, -1)
+    q = q.permute(0, 2, 1)   # b,hw,c
+    k = k.reshape(b, c, -1) # b,c,hw
+    v = v.reshape(b, c, -1)

    r1 = slice_attention(q, k, v)
-    h_ = r1.reshape(b,c,h,w)
+    h_ = r1.reshape(orig_shape)
    del r1
    return h_

 def xformers_attention(q, k, v):
    # compute attention
-    B, C, H, W = q.shape
+    orig_shape = q.shape
+    B = orig_shape[0]
+    C = orig_shape[1]
    q, k, v = map(
        lambda t: t.view(B, C, -1).transpose(1, 2).contiguous(),
        (q, k, v),
@ -218,14 +269,16 @@ def xformers_attention(q, k, v):

    try:
        out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None)
-        out = out.transpose(1, 2).reshape(B, C, H, W)
-    except NotImplementedError as e:
-        out = slice_attention(q.view(B, -1, C), k.view(B, -1, C).transpose(1, 2), v.view(B, -1, C).transpose(1, 2)).reshape(B, C, H, W)
+        out = out.transpose(1, 2).reshape(orig_shape)
+    except NotImplementedError:
+        out = slice_attention(q.view(B, -1, C), k.view(B, -1, C).transpose(1, 2), v.view(B, -1, C).transpose(1, 2)).reshape(orig_shape)
    return out

 def pytorch_attention(q, k, v):
    # compute attention
-    B, C, H, W = q.shape
+    orig_shape = q.shape
+    B = orig_shape[0]
+    C = orig_shape[1]
    q, k, v = map(
        lambda t: t.view(B, 1, C, -1).transpose(2, 3).contiguous(),
        (q, k, v),
@ -233,49 +286,52 @@ def pytorch_attention(q, k, v):

    try:
        out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False)
-        out = out.transpose(2, 3).reshape(B, C, H, W)
-    except model_management.OOM_EXCEPTION as e:
+        out = out.transpose(2, 3).reshape(orig_shape)
+    except model_management.OOM_EXCEPTION:
        logging.warning("scaled_dot_product_attention OOMed: switched to slice attention")
-        out = slice_attention(q.view(B, -1, C), k.view(B, -1, C).transpose(1, 2), v.view(B, -1, C).transpose(1, 2)).reshape(B, C, H, W)
+        out = slice_attention(q.view(B, -1, C), k.view(B, -1, C).transpose(1, 2), v.view(B, -1, C).transpose(1, 2)).reshape(orig_shape)
    return out


+def vae_attention():
+    if model_management.xformers_enabled_vae():
+        logging.info("Using xformers attention in VAE")
+        return xformers_attention
+    elif model_management.pytorch_attention_enabled_vae():
+        logging.info("Using pytorch attention in VAE")
+        return pytorch_attention
+    else:
+        logging.info("Using split attention in VAE")
+        return normal_attention
+
 class AttnBlock(nn.Module):
-    def __init__(self, in_channels):
+    def __init__(self, in_channels, conv_op=ops.Conv2d):
        super().__init__()
        self.in_channels = in_channels

        self.norm = Normalize(in_channels)
-        self.q = ops.Conv2d(in_channels,
+        self.q = conv_op(in_channels,
                                 in_channels,
                                 kernel_size=1,
                                 stride=1,
                                 padding=0)
-        self.k = ops.Conv2d(in_channels,
+        self.k = conv_op(in_channels,
                                 in_channels,
                                 kernel_size=1,
                                 stride=1,
                                 padding=0)
-        self.v = ops.Conv2d(in_channels,
+        self.v = conv_op(in_channels,
                                 in_channels,
                                 kernel_size=1,
                                 stride=1,
                                 padding=0)
-        self.proj_out = ops.Conv2d(in_channels,
+        self.proj_out = conv_op(in_channels,
                                        in_channels,
                                        kernel_size=1,
                                        stride=1,
                                        padding=0)

-        if model_management.xformers_enabled_vae():
-            logging.info("Using xformers attention in VAE")
-            self.optimized_attention = xformers_attention
-        elif model_management.pytorch_attention_enabled():
-            logging.info("Using pytorch attention in VAE")
-            self.optimized_attention = pytorch_attention
-        else:
-            logging.info("Using split attention in VAE")
-            self.optimized_attention = normal_attention
+        self.optimized_attention = vae_attention()

    def forward(self, x):
        h_ = x
@ -291,8 +347,8 @@ class AttnBlock(nn.Module):
        return x+h_


-def make_attn(in_channels, attn_type="vanilla", attn_kwargs=None):
-    return AttnBlock(in_channels)
+def make_attn(in_channels, attn_type="vanilla", attn_kwargs=None, conv_op=ops.Conv2d):
+    return AttnBlock(in_channels, conv_op=conv_op)


 class Model(nn.Module):
@ -451,6 +507,7 @@ class Encoder(nn.Module):
    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
+                 conv3d=False, time_compress=None,
                 **ignore_kwargs):
        super().__init__()
        if use_linear_attn: attn_type = "linear"
@ -461,8 +518,15 @@ class Encoder(nn.Module):
        self.resolution = resolution
        self.in_channels = in_channels

+        if conv3d:
+            conv_op = VideoConv3d
+            mid_attn_conv_op = ops.Conv3d
+        else:
+            conv_op = ops.Conv2d
+            mid_attn_conv_op = ops.Conv2d
+
        # downsampling
-        self.conv_in = ops.Conv2d(in_channels,
+        self.conv_in = conv_op(in_channels,
                                       self.ch,
                                       kernel_size=3,
                                       stride=1,
@ -481,15 +545,20 @@ class Encoder(nn.Module):
                block.append(ResnetBlock(in_channels=block_in,
                                         out_channels=block_out,
                                         temb_channels=self.temb_ch,
-                                         dropout=dropout))
+                                         dropout=dropout,
+                                         conv_op=conv_op))
                block_in = block_out
                if curr_res in attn_resolutions:
-                    attn.append(make_attn(block_in, attn_type=attn_type))
+                    attn.append(make_attn(block_in, attn_type=attn_type, conv_op=conv_op))
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions-1:
-                down.downsample = Downsample(block_in, resamp_with_conv)
+                stride = 2
+                if time_compress is not None:
+                    if (self.num_resolutions - 1 - i_level) > math.log2(time_compress):
+                        stride = (1, 2, 2)
+                down.downsample = Downsample(block_in, resamp_with_conv, stride=stride, conv_op=conv_op)
                curr_res = curr_res // 2
            self.down.append(down)

@ -498,16 +567,18 @@ class Encoder(nn.Module):
        self.mid.block_1 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
-                                       dropout=dropout)
-        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
+                                       dropout=dropout,
+                                       conv_op=conv_op)
+        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type, conv_op=mid_attn_conv_op)
        self.mid.block_2 = ResnetBlock(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
-                                       dropout=dropout)
+                                       dropout=dropout,
+                                       conv_op=conv_op)

        # end
        self.norm_out = Normalize(block_in)
-        self.conv_out = ops.Conv2d(block_in,
+        self.conv_out = conv_op(block_in,
                                        2*z_channels if double_z else z_channels,
                                        kernel_size=3,
                                        stride=1,
@ -545,9 +616,10 @@ class Decoder(nn.Module):
                 conv_out_op=ops.Conv2d,
                 resnet_op=ResnetBlock,
                 attn_op=AttnBlock,
+                 conv3d=False,
+                 time_compress=None,
                **ignorekwargs):
        super().__init__()
-        if use_linear_attn: attn_type = "linear"
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
@ -557,8 +629,15 @@ class Decoder(nn.Module):
        self.give_pre_end = give_pre_end
        self.tanh_out = tanh_out

-        # compute in_ch_mult, block_in and curr_res at lowest res
-        in_ch_mult = (1,)+tuple(ch_mult)
+        if conv3d:
+            conv_op = VideoConv3d
+            conv_out_op = VideoConv3d
+            mid_attn_conv_op = ops.Conv3d
+        else:
+            conv_op = ops.Conv2d
+            mid_attn_conv_op = ops.Conv2d
+
+        # compute block_in and curr_res at lowest res
        block_in = ch*ch_mult[self.num_resolutions-1]
        curr_res = resolution // 2**(self.num_resolutions-1)
        self.z_shape = (1,z_channels,curr_res,curr_res)
@ -566,7 +645,7 @@ class Decoder(nn.Module):
            self.z_shape, np.prod(self.z_shape)))

        # z to block_in
-        self.conv_in = ops.Conv2d(z_channels,
+        self.conv_in = conv_op(z_channels,
                                       block_in,
                                       kernel_size=3,
                                       stride=1,
@ -577,12 +656,14 @@ class Decoder(nn.Module):
        self.mid.block_1 = resnet_op(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
-                                       dropout=dropout)
-        self.mid.attn_1 = attn_op(block_in)
+                                       dropout=dropout,
+                                       conv_op=conv_op)
+        self.mid.attn_1 = attn_op(block_in, conv_op=mid_attn_conv_op)
        self.mid.block_2 = resnet_op(in_channels=block_in,
                                       out_channels=block_in,
                                       temb_channels=self.temb_ch,
-                                       dropout=dropout)
+                                       dropout=dropout,
+                                       conv_op=conv_op)

        # upsampling
        self.up = nn.ModuleList()
@ -594,15 +675,21 @@ class Decoder(nn.Module):
                block.append(resnet_op(in_channels=block_in,
                                         out_channels=block_out,
                                         temb_channels=self.temb_ch,
-                                         dropout=dropout))
+                                         dropout=dropout,
+                                         conv_op=conv_op))
                block_in = block_out
                if curr_res in attn_resolutions:
-                    attn.append(attn_op(block_in))
+                    attn.append(attn_op(block_in, conv_op=conv_op))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
-                up.upsample = Upsample(block_in, resamp_with_conv)
+                scale_factor = 2.0
+                if time_compress is not None:
+                    if i_level > math.log2(time_compress):
+                        scale_factor = (1.0, 2.0, 2.0)
+
+                up.upsample = Upsample(block_in, resamp_with_conv, conv_op=conv_op, scale_factor=scale_factor)
                curr_res = curr_res * 2
            self.up.insert(0, up) # prepend to get consistent order

@ -615,9 +702,6 @@ class Decoder(nn.Module):
                                        padding=1)

    def forward(self, z, **kwargs):
-        #assert z.shape[1:] == self.z_shape[1:]
-        self.last_z_shape = z.shape
-
        # timestep embedding
        temb = None

--- a/comfy/ldm/modules/diffusionmodules/openaimodel.py
+++ b/comfy/ldm/modules/diffusionmodules/openaimodel.py
@ -9,7 +9,6 @@ import logging
 from .util import (
    checkpoint,
    avg_pool_nd,
-    zero_module,
    timestep_embedding,
    AlphaBlender,
 )
--- a/comfy/ldm/modules/diffusionmodules/upscaling.py
+++ b/comfy/ldm/modules/diffusionmodules/upscaling.py
@ -4,7 +4,6 @@ import numpy as np
 from functools import partial

 from .util import extract_into_tensor, make_beta_schedule
-from comfy.ldm.util import default


 class AbstractLowScaleModel(nn.Module):
--- a/comfy/ldm/modules/diffusionmodules/util.py
+++ b/comfy/ldm/modules/diffusionmodules/util.py
@ -8,8 +8,8 @@
 # thanks!


-import os
 import math
+import logging
 import torch
 import torch.nn as nn
 import numpy as np
@ -131,7 +131,7 @@ def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timestep
    # add one to get the final alpha values right (the ones from first scale to data during sampling)
    steps_out = ddim_timesteps + 1
    if verbose:
-        print(f'Selected timesteps for ddim sampler: {steps_out}')
+        logging.info(f'Selected timesteps for ddim sampler: {steps_out}')
    return steps_out


@ -143,8 +143,8 @@ def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
    # according the the formula provided in https://arxiv.org/abs/2010.02502
    sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
    if verbose:
-        print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
-        print(f'For the chosen value of eta, which is {eta}, '
+        logging.info(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
+        logging.info(f'For the chosen value of eta, which is {eta}, '
              f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
    return sigmas, alphas, alphas_prev

--- a/comfy/ldm/modules/distributions/distributions.py
+++ b/comfy/ldm/modules/distributions/distributions.py
@ -30,10 +30,10 @@ class DiagonalGaussianDistribution(object):
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
-            self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)
+            self.var = self.std = torch.zeros_like(self.mean, device=self.parameters.device)

    def sample(self):
-        x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
+        x = self.mean + self.std * torch.randn(self.mean.shape, device=self.parameters.device)
        return x

    def kl(self, other=None):
--- a/comfy/ldm/modules/sub_quadratic_attention.py
+++ b/comfy/ldm/modules/sub_quadratic_attention.py
@ -22,7 +22,6 @@ except ImportError:
    from typing import Optional, NamedTuple, List
    from typing_extensions import Protocol

-from torch import Tensor
 from typing import List

 from comfy import model_management
@ -172,7 +171,7 @@ def _get_attention_scores_no_kv_chunking(
        del attn_scores
    except model_management.OOM_EXCEPTION:
        logging.warning("ran out of memory while running softmax in  _get_attention_scores_no_kv_chunking, trying slower in place softmax instead")
-        attn_scores -= attn_scores.max(dim=-1, keepdim=True).values
+        attn_scores -= attn_scores.max(dim=-1, keepdim=True).values # noqa: F821 attn_scores is not defined
        torch.exp(attn_scores, out=attn_scores)
        summed = torch.sum(attn_scores, dim=-1, keepdim=True)
        attn_scores /= summed
--- a/comfy/ldm/modules/temporal_ae.py
+++ b/comfy/ldm/modules/temporal_ae.py
@ -1,5 +1,5 @@
 import functools
-from typing import Callable, Iterable, Union
+from typing import Iterable, Union

 import torch
 from einops import rearrange, repeat
@ -194,6 +194,7 @@ def make_time_attn(
    attn_kwargs=None,
    alpha: float = 0,
    merge_strategy: str = "learned",
+    conv_op=ops.Conv2d,
 ):
    return partialclass(
        AttnVideoBlock, in_channels, alpha=alpha, merge_strategy=merge_strategy
--- a/comfy/ldm/pixart/blocks.py
+++ b/comfy/ldm/pixart/blocks.py
@ -0,0 +1,380 @@
+# Based on:
+# https://github.com/PixArt-alpha/PixArt-alpha [Apache 2.0 license]
+# https://github.com/PixArt-alpha/PixArt-sigma [Apache 2.0 license]
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+
+from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, Mlp, timestep_embedding
+from comfy.ldm.modules.attention import optimized_attention
+
+# if model_management.xformers_enabled():
+#     import xformers.ops
+#     if int((xformers.__version__).split(".")[2].split("+")[0]) >= 28:
+#         block_diagonal_mask_from_seqlens = xformers.ops.fmha.attn_bias.BlockDiagonalMask.from_seqlens
+#     else:
+#         block_diagonal_mask_from_seqlens = xformers.ops.fmha.BlockDiagonalMask.from_seqlens
+
+def modulate(x, shift, scale):
+    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+
+def t2i_modulate(x, shift, scale):
+    return x * (1 + scale) + shift
+
+class MultiHeadCrossAttention(nn.Module):
+    def __init__(self, d_model, num_heads, attn_drop=0., proj_drop=0., dtype=None, device=None, operations=None, **kwargs):
+        super(MultiHeadCrossAttention, self).__init__()
+        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
+
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.head_dim = d_model // num_heads
+
+        self.q_linear = operations.Linear(d_model, d_model, dtype=dtype, device=device)
+        self.kv_linear = operations.Linear(d_model, d_model*2, dtype=dtype, device=device)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = operations.Linear(d_model, d_model, dtype=dtype, device=device)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, cond, mask=None):
+        # query/value: img tokens; key: condition; mask: if padding tokens
+        B, N, C = x.shape
+
+        q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim)
+        kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)
+        k, v = kv.unbind(2)
+
+        assert mask is None # TODO?
+        # # TODO: xformers needs separate mask logic here
+        # if model_management.xformers_enabled():
+        #     attn_bias = None
+        #     if mask is not None:
+        #         attn_bias = block_diagonal_mask_from_seqlens([N] * B, mask)
+        #     x = xformers.ops.memory_efficient_attention(q, k, v, p=0, attn_bias=attn_bias)
+        # else:
+        #     q, k, v = map(lambda t: t.transpose(1, 2), (q, k, v),)
+        #     attn_mask = None
+        #     mask = torch.ones(())
+        #     if mask is not None and len(mask) > 1:
+        #         # Create equivalent of xformer diagonal block mask, still only correct for square masks
+        #         # But depth doesn't matter as tensors can expand in that dimension
+        #         attn_mask_template = torch.ones(
+        #             [q.shape[2] // B, mask[0]],
+        #             dtype=torch.bool,
+        #             device=q.device
+        #         )
+        #         attn_mask = torch.block_diag(attn_mask_template)
+        #
+        #         # create a mask on the diagonal for each mask in the batch
+        #         for _ in range(B - 1):
+        #             attn_mask = torch.block_diag(attn_mask, attn_mask_template)
+        #     x = optimized_attention(q, k, v, self.num_heads, mask=attn_mask, skip_reshape=True)
+
+        x = optimized_attention(q.view(B, -1, C), k.view(B, -1, C), v.view(B, -1, C), self.num_heads, mask=None)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class AttentionKVCompress(nn.Module):
+    """Multi-head Attention block with KV token compression and qk norm."""
+    def __init__(self, dim, num_heads=8, qkv_bias=True, sampling='conv', sr_ratio=1, qk_norm=False, dtype=None, device=None, operations=None, **kwargs):
+        """
+        Args:
+            dim (int): Number of input channels.
+            num_heads (int): Number of attention heads.
+            qkv_bias (bool:  If True, add a learnable bias to query, key, value.
+        """
+        super().__init__()
+        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+
+        self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
+        self.proj = operations.Linear(dim, dim, dtype=dtype, device=device)
+
+        self.sampling=sampling    # ['conv', 'ave', 'uniform', 'uniform_every']
+        self.sr_ratio = sr_ratio
+        if sr_ratio > 1 and sampling == 'conv':
+            # Avg Conv Init.
+            self.sr = operations.Conv2d(dim, dim, groups=dim, kernel_size=sr_ratio, stride=sr_ratio, dtype=dtype, device=device)
+            # self.sr.weight.data.fill_(1/sr_ratio**2)
+            # self.sr.bias.data.zero_()
+            self.norm = operations.LayerNorm(dim, dtype=dtype, device=device)
+        if qk_norm:
+            self.q_norm = operations.LayerNorm(dim, dtype=dtype, device=device)
+            self.k_norm = operations.LayerNorm(dim, dtype=dtype, device=device)
+        else:
+            self.q_norm = nn.Identity()
+            self.k_norm = nn.Identity()
+
+    def downsample_2d(self, tensor, H, W, scale_factor, sampling=None):
+        if sampling is None or scale_factor == 1:
+            return tensor
+        B, N, C = tensor.shape
+
+        if sampling == 'uniform_every':
+            return tensor[:, ::scale_factor], int(N // scale_factor)
+
+        tensor = tensor.reshape(B, H, W, C).permute(0, 3, 1, 2)
+        new_H, new_W = int(H / scale_factor), int(W / scale_factor)
+        new_N = new_H * new_W
+
+        if sampling == 'ave':
+            tensor = F.interpolate(
+                tensor, scale_factor=1 / scale_factor, mode='nearest'
+            ).permute(0, 2, 3, 1)
+        elif sampling == 'uniform':
+            tensor = tensor[:, :, ::scale_factor, ::scale_factor].permute(0, 2, 3, 1)
+        elif sampling == 'conv':
+            tensor = self.sr(tensor).reshape(B, C, -1).permute(0, 2, 1)
+            tensor = self.norm(tensor)
+        else:
+            raise ValueError
+
+        return tensor.reshape(B, new_N, C).contiguous(), new_N
+
+    def forward(self, x, mask=None, HW=None, block_id=None):
+        B, N, C = x.shape # 2 4096 1152
+        new_N = N
+        if HW is None:
+            H = W = int(N ** 0.5)
+        else:
+            H, W = HW
+        qkv = self.qkv(x).reshape(B, N, 3, C)
+
+        q, k, v = qkv.unbind(2)
+        q = self.q_norm(q)
+        k = self.k_norm(k)
+
+        # KV compression
+        if self.sr_ratio > 1:
+            k, new_N = self.downsample_2d(k, H, W, self.sr_ratio, sampling=self.sampling)
+            v, new_N = self.downsample_2d(v, H, W, self.sr_ratio, sampling=self.sampling)
+
+        q = q.reshape(B, N, self.num_heads, C // self.num_heads)
+        k = k.reshape(B, new_N, self.num_heads, C // self.num_heads)
+        v = v.reshape(B, new_N, self.num_heads, C // self.num_heads)
+
+        if mask is not None:
+            raise NotImplementedError("Attn mask logic not added for self attention")
+
+        # This is never called at the moment
+        # attn_bias = None
+        # if mask is not None:
+        #     attn_bias = torch.zeros([B * self.num_heads, q.shape[1], k.shape[1]], dtype=q.dtype, device=q.device)
+        #     attn_bias.masked_fill_(mask.squeeze(1).repeat(self.num_heads, 1, 1) == 0, float('-inf'))
+
+        # attention 2
+        q, k, v = map(lambda t: t.transpose(1, 2), (q, k, v),)
+        x = optimized_attention(q, k, v, self.num_heads, mask=None, skip_reshape=True)
+
+        x = x.view(B, N, C)
+        x = self.proj(x)
+        return x
+
+
+class FinalLayer(nn.Module):
+    """
+    The final layer of PixArt.
+    """
+    def __init__(self, hidden_size, patch_size, out_channels, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device)
+        )
+
+    def forward(self, x, c):
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
+        x = modulate(self.norm_final(x), shift, scale)
+        x = self.linear(x)
+        return x
+
+class T2IFinalLayer(nn.Module):
+    """
+    The final layer of PixArt.
+    """
+    def __init__(self, hidden_size, patch_size, out_channels, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
+        self.scale_shift_table = nn.Parameter(torch.randn(2, hidden_size) / hidden_size ** 0.5)
+        self.out_channels = out_channels
+
+    def forward(self, x, t):
+        shift, scale = (self.scale_shift_table[None].to(dtype=x.dtype, device=x.device) + t[:, None]).chunk(2, dim=1)
+        x = t2i_modulate(self.norm_final(x), shift, scale)
+        x = self.linear(x)
+        return x
+
+
+class MaskFinalLayer(nn.Module):
+    """
+    The final layer of PixArt.
+    """
+    def __init__(self, final_hidden_size, c_emb_size, patch_size, out_channels, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.norm_final = operations.LayerNorm(final_hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.linear = operations.Linear(final_hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(c_emb_size, 2 * final_hidden_size, bias=True, dtype=dtype, device=device)
+        )
+    def forward(self, x, t):
+        shift, scale = self.adaLN_modulation(t).chunk(2, dim=1)
+        x = modulate(self.norm_final(x), shift, scale)
+        x = self.linear(x)
+        return x
+
+
+class DecoderLayer(nn.Module):
+    """
+    The final layer of PixArt.
+    """
+    def __init__(self, hidden_size, decoder_hidden_size, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.norm_decoder = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.linear = operations.Linear(hidden_size, decoder_hidden_size, bias=True, dtype=dtype, device=device)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device)
+        )
+    def forward(self, x, t):
+        shift, scale = self.adaLN_modulation(t).chunk(2, dim=1)
+        x = modulate(self.norm_decoder(x), shift, scale)
+        x = self.linear(x)
+        return x
+
+
+class SizeEmbedder(TimestepEmbedder):
+    """
+    Embeds scalar timesteps into vector representations.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256, dtype=None, device=None, operations=None):
+        super().__init__(hidden_size=hidden_size, frequency_embedding_size=frequency_embedding_size, operations=operations)
+        self.mlp = nn.Sequential(
+            operations.Linear(frequency_embedding_size, hidden_size, bias=True, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Linear(hidden_size, hidden_size, bias=True, dtype=dtype, device=device),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+        self.outdim = hidden_size
+
+    def forward(self, s, bs):
+        if s.ndim == 1:
+            s = s[:, None]
+        assert s.ndim == 2
+        if s.shape[0] != bs:
+            s = s.repeat(bs//s.shape[0], 1)
+            assert s.shape[0] == bs
+        b, dims = s.shape[0], s.shape[1]
+        s = rearrange(s, "b d -> (b d)")
+        s_freq = timestep_embedding(s, self.frequency_embedding_size)
+        s_emb = self.mlp(s_freq.to(s.dtype))
+        s_emb = rearrange(s_emb, "(b d) d2 -> b (d d2)", b=b, d=dims, d2=self.outdim)
+        return s_emb
+
+
+class LabelEmbedder(nn.Module):
+    """
+    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
+    """
+    def __init__(self, num_classes, hidden_size, dropout_prob, dtype=None, device=None, operations=None):
+        super().__init__()
+        use_cfg_embedding = dropout_prob > 0
+        self.embedding_table = operations.Embedding(num_classes + use_cfg_embedding, hidden_size, dtype=dtype, device=device),
+        self.num_classes = num_classes
+        self.dropout_prob = dropout_prob
+
+    def token_drop(self, labels, force_drop_ids=None):
+        """
+        Drops labels to enable classifier-free guidance.
+        """
+        if force_drop_ids is None:
+            drop_ids = torch.rand(labels.shape[0]).cuda() < self.dropout_prob
+        else:
+            drop_ids = force_drop_ids == 1
+        labels = torch.where(drop_ids, self.num_classes, labels)
+        return labels
+
+    def forward(self, labels, train, force_drop_ids=None):
+        use_dropout = self.dropout_prob > 0
+        if (train and use_dropout) or (force_drop_ids is not None):
+            labels = self.token_drop(labels, force_drop_ids)
+        embeddings = self.embedding_table(labels)
+        return embeddings
+
+
+class CaptionEmbedder(nn.Module):
+    """
+    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
+    """
+    def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=120, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.y_proj = Mlp(
+            in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer,
+            dtype=dtype, device=device, operations=operations,
+        )
+        self.register_buffer("y_embedding", nn.Parameter(torch.randn(token_num, in_channels) / in_channels ** 0.5))
+        self.uncond_prob = uncond_prob
+
+    def token_drop(self, caption, force_drop_ids=None):
+        """
+        Drops labels to enable classifier-free guidance.
+        """
+        if force_drop_ids is None:
+            drop_ids = torch.rand(caption.shape[0]).cuda() < self.uncond_prob
+        else:
+            drop_ids = force_drop_ids == 1
+        caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
+        return caption
+
+    def forward(self, caption, train, force_drop_ids=None):
+        if train:
+            assert caption.shape[2:] == self.y_embedding.shape
+        use_dropout = self.uncond_prob > 0
+        if (train and use_dropout) or (force_drop_ids is not None):
+            caption = self.token_drop(caption, force_drop_ids)
+        caption = self.y_proj(caption)
+        return caption
+
+
+class CaptionEmbedderDoubleBr(nn.Module):
+    """
+    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
+    """
+    def __init__(self, in_channels, hidden_size, uncond_prob, act_layer=nn.GELU(approximate='tanh'), token_num=120, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.proj = Mlp(
+            in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size, act_layer=act_layer,
+            dtype=dtype, device=device, operations=operations,
+        )
+        self.embedding = nn.Parameter(torch.randn(1, in_channels) / 10 ** 0.5)
+        self.y_embedding = nn.Parameter(torch.randn(token_num, in_channels) / 10 ** 0.5)
+        self.uncond_prob = uncond_prob
+
+    def token_drop(self, global_caption, caption, force_drop_ids=None):
+        """
+        Drops labels to enable classifier-free guidance.
+        """
+        if force_drop_ids is None:
+            drop_ids = torch.rand(global_caption.shape[0]).cuda() < self.uncond_prob
+        else:
+            drop_ids = force_drop_ids == 1
+        global_caption = torch.where(drop_ids[:, None], self.embedding, global_caption)
+        caption = torch.where(drop_ids[:, None, None, None], self.y_embedding, caption)
+        return global_caption, caption
+
+    def forward(self, caption, train, force_drop_ids=None):
+        assert caption.shape[2: ] == self.y_embedding.shape
+        global_caption = caption.mean(dim=2).squeeze()
+        use_dropout = self.uncond_prob > 0
+        if (train and use_dropout) or (force_drop_ids is not None):
+            global_caption, caption = self.token_drop(global_caption, caption, force_drop_ids)
+        y_embed = self.proj(global_caption)
+        return y_embed, caption
--- a/comfy/ldm/pixart/pixartms.py
+++ b/comfy/ldm/pixart/pixartms.py
@ -0,0 +1,256 @@
+# Based on:
+# https://github.com/PixArt-alpha/PixArt-alpha [Apache 2.0 license]
+# https://github.com/PixArt-alpha/PixArt-sigma [Apache 2.0 license]
+import torch
+import torch.nn as nn
+
+from .blocks import (
+    t2i_modulate,
+    CaptionEmbedder,
+    AttentionKVCompress,
+    MultiHeadCrossAttention,
+    T2IFinalLayer,
+    SizeEmbedder,
+)
+from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, PatchEmbed, Mlp, get_1d_sincos_pos_embed_from_grid_torch
+
+
+def get_2d_sincos_pos_embed_torch(embed_dim, w, h, pe_interpolation=1.0, base_size=16, device=None, dtype=torch.float32):
+    grid_h, grid_w = torch.meshgrid(
+        torch.arange(h, device=device, dtype=dtype) / (h/base_size) / pe_interpolation,
+        torch.arange(w, device=device, dtype=dtype) / (w/base_size) / pe_interpolation,
+        indexing='ij'
+    )
+    emb_h = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_h, device=device, dtype=dtype)
+    emb_w = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_w, device=device, dtype=dtype)
+    emb = torch.cat([emb_w, emb_h], dim=1)  # (H*W, D)
+    return emb
+
+class PixArtMSBlock(nn.Module):
+    """
+    A PixArt block with adaptive layer norm zero (adaLN-Zero) conditioning.
+    """
+    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., input_size=None,
+                 sampling=None, sr_ratio=1, qk_norm=False, dtype=None, device=None, operations=None, **block_kwargs):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.attn = AttentionKVCompress(
+            hidden_size, num_heads=num_heads, qkv_bias=True, sampling=sampling, sr_ratio=sr_ratio,
+            qk_norm=qk_norm, dtype=dtype, device=device, operations=operations, **block_kwargs
+        )
+        self.cross_attn = MultiHeadCrossAttention(
+            hidden_size, num_heads, dtype=dtype, device=device, operations=operations, **block_kwargs
+        )
+        self.norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        # to be compatible with lower version pytorch
+        approx_gelu = lambda: nn.GELU(approximate="tanh")
+        self.mlp = Mlp(
+            in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu,
+            dtype=dtype, device=device, operations=operations
+        )
+        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5)
+
+    def forward(self, x, y, t, mask=None, HW=None, **kwargs):
+        B, N, C = x.shape
+
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None].to(dtype=x.dtype, device=x.device) + t.reshape(B, 6, -1)).chunk(6, dim=1)
+        x = x + (gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa), HW=HW))
+        x = x + self.cross_attn(x, y, mask)
+        x = x + (gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))
+
+        return x
+
+
+### Core PixArt Model ###
+class PixArtMS(nn.Module):
+    """
+    Diffusion model with a Transformer backbone.
+    """
+    def __init__(
+            self,
+            input_size=32,
+            patch_size=2,
+            in_channels=4,
+            hidden_size=1152,
+            depth=28,
+            num_heads=16,
+            mlp_ratio=4.0,
+            class_dropout_prob=0.1,
+            learn_sigma=True,
+            pred_sigma=True,
+            drop_path: float = 0.,
+            caption_channels=4096,
+            pe_interpolation=None,
+            pe_precision=None,
+            config=None,
+            model_max_length=120,
+            micro_condition=True,
+            qk_norm=False,
+            kv_compress_config=None,
+            dtype=None,
+            device=None,
+            operations=None,
+            **kwargs,
+    ):
+        nn.Module.__init__(self)
+        self.dtype = dtype
+        self.pred_sigma = pred_sigma
+        self.in_channels = in_channels
+        self.out_channels = in_channels * 2 if pred_sigma else in_channels
+        self.patch_size = patch_size
+        self.num_heads = num_heads
+        self.pe_interpolation = pe_interpolation
+        self.pe_precision = pe_precision
+        self.hidden_size = hidden_size
+        self.depth = depth
+
+        approx_gelu = lambda: nn.GELU(approximate="tanh")
+        self.t_block = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(hidden_size, 6 * hidden_size, bias=True, dtype=dtype, device=device)
+        )
+        self.x_embedder = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_channels,
+            embed_dim=hidden_size,
+            bias=True,
+            dtype=dtype,
+            device=device,
+            operations=operations
+        )
+        self.t_embedder = TimestepEmbedder(
+            hidden_size, dtype=dtype, device=device, operations=operations,
+        )
+        self.y_embedder = CaptionEmbedder(
+            in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob,
+            act_layer=approx_gelu, token_num=model_max_length,
+            dtype=dtype, device=device, operations=operations,
+        )
+
+        self.micro_conditioning = micro_condition
+        if self.micro_conditioning:
+            self.csize_embedder = SizeEmbedder(hidden_size//3, dtype=dtype, device=device, operations=operations)
+            self.ar_embedder = SizeEmbedder(hidden_size//3, dtype=dtype, device=device, operations=operations)
+
+        # For fixed sin-cos embedding:
+        # num_patches = (input_size // patch_size) * (input_size // patch_size)
+        # self.base_size = input_size // self.patch_size
+        # self.register_buffer("pos_embed", torch.zeros(1, num_patches, hidden_size))
+
+        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule
+        if kv_compress_config is None:
+            kv_compress_config = {
+                'sampling': None,
+                'scale_factor': 1,
+                'kv_compress_layer': [],
+            }
+        self.blocks = nn.ModuleList([
+            PixArtMSBlock(
+                hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i],
+                sampling=kv_compress_config['sampling'],
+                sr_ratio=int(kv_compress_config['scale_factor']) if i in kv_compress_config['kv_compress_layer'] else 1,
+                qk_norm=qk_norm,
+                dtype=dtype,
+                device=device,
+                operations=operations,
+            )
+            for i in range(depth)
+        ])
+        self.final_layer = T2IFinalLayer(
+            hidden_size, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations
+        )
+
+    def forward_orig(self, x, timestep, y, mask=None, c_size=None, c_ar=None, **kwargs):
+        """
+        Original forward pass of PixArt.
+        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
+        t: (N,) tensor of diffusion timesteps
+        y: (N, 1, 120, C) conditioning
+        ar: (N, 1): aspect ratio
+        cs: (N ,2) size conditioning for height/width
+        """
+        B, C, H, W = x.shape
+        c_res = (H + W) // 2
+        pe_interpolation = self.pe_interpolation
+        if pe_interpolation is None or self.pe_precision is not None:
+            # calculate pe_interpolation on-the-fly
+            pe_interpolation = round(c_res / (512/8.0), self.pe_precision or 0)
+
+        pos_embed = get_2d_sincos_pos_embed_torch(
+            self.hidden_size,
+            h=(H // self.patch_size),
+            w=(W // self.patch_size),
+            pe_interpolation=pe_interpolation,
+            base_size=((round(c_res / 64) * 64) // self.patch_size),
+            device=x.device,
+            dtype=x.dtype,
+        ).unsqueeze(0)
+
+        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
+        t = self.t_embedder(timestep, x.dtype)  # (N, D)
+
+        if self.micro_conditioning and (c_size is not None and c_ar is not None):
+            bs = x.shape[0]
+            c_size = self.csize_embedder(c_size, bs)  # (N, D)
+            c_ar = self.ar_embedder(c_ar, bs)  # (N, D)
+            t = t + torch.cat([c_size, c_ar], dim=1)
+
+        t0 = self.t_block(t)
+        y = self.y_embedder(y, self.training)  # (N, D)
+
+        if mask is not None:
+            if mask.shape[0] != y.shape[0]:
+                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
+            mask = mask.squeeze(1).squeeze(1)
+            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
+            y_lens = mask.sum(dim=1).tolist()
+        else:
+            y_lens = None
+            y = y.squeeze(1).view(1, -1, x.shape[-1])
+        for block in self.blocks:
+            x = block(x, y, t0, y_lens, (H, W), **kwargs)  # (N, T, D)
+
+        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
+        x = self.unpatchify(x, H, W)  # (N, out_channels, H, W)
+
+        return x
+
+    def forward(self, x, timesteps, context, c_size=None, c_ar=None, **kwargs):
+        B, C, H, W = x.shape
+
+        # Fallback for missing microconds
+        if self.micro_conditioning:
+            if c_size is None:
+                c_size = torch.tensor([H*8, W*8], dtype=x.dtype, device=x.device).repeat(B, 1)
+
+            if c_ar is None:
+                c_ar = torch.tensor([H/W], dtype=x.dtype, device=x.device).repeat(B, 1)
+
+        ## Still accepts the input w/o that dim but returns garbage
+        if len(context.shape) == 3:
+            context = context.unsqueeze(1)
+
+        ## run original forward pass
+        out = self.forward_orig(x, timesteps, context, c_size=c_size, c_ar=c_ar)
+
+        ## only return EPS
+        if self.pred_sigma:
+            return out[:, :self.in_channels]
+        return out
+
+    def unpatchify(self, x, h, w):
+        """
+        x: (N, T, patch_size**2 * C)
+        imgs: (N, H, W, C)
+        """
+        c = self.out_channels
+        p = self.x_embedder.patch_size[0]
+        h = h // self.patch_size
+        w = w // self.patch_size
+        assert h * w == x.shape[1]
+
+        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
+        x = torch.einsum('nhwpqc->nchpwq', x)
+        imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
+        return imgs
--- a/comfy/ldm/util.py
+++ b/comfy/ldm/util.py
@ -1,4 +1,5 @@
 import importlib
+import logging

 import torch
 from torch import optim
@ -23,7 +24,7 @@ def log_txt_as_img(wh, xc, size=10):
        try:
            draw.text((0, 0), lines, fill="black", font=font)
        except UnicodeEncodeError:
-            print("Cant encode string for logging. Skipping.")
+            logging.warning("Cant encode string for logging. Skipping.")

        txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
        txts.append(txt)
@ -65,7 +66,7 @@ def mean_flat(tensor):
 def count_params(model, verbose=False):
    total_params = sum(p.numel() for p in model.parameters())
    if verbose:
-        print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
+        logging.info(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
    return total_params


@ -133,7 +134,6 @@ class AdamWwithEMAandWings(optim.Optimizer):
            exp_avgs = []
            exp_avg_sqs = []
            ema_params_with_grad = []
-            state_sums = []
            max_exp_avg_sqs = []
            state_steps = []
            amsgrad = group['amsgrad']
--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@ -0,0 +1,485 @@
+# original version: https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/model.py
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+from einops import repeat
+
+from comfy.ldm.modules.attention import optimized_attention
+from comfy.ldm.flux.layers import EmbedND
+from comfy.ldm.flux.math import apply_rope
+from comfy.ldm.modules.diffusionmodules.mmdit import RMSNorm
+import comfy.ldm.common_dit
+import comfy.model_management
+
+
+def sinusoidal_embedding_1d(dim, position):
+    # preprocess
+    assert dim % 2 == 0
+    half = dim // 2
+    position = position.type(torch.float32)
+
+    # calculation
+    sinusoid = torch.outer(
+        position, torch.pow(10000, -torch.arange(half).to(position).div(half)))
+    x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
+    return x
+
+
+class WanSelfAttention(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 window_size=(-1, -1),
+                 qk_norm=True,
+                 eps=1e-6, operation_settings={}):
+        assert dim % num_heads == 0
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.window_size = window_size
+        self.qk_norm = qk_norm
+        self.eps = eps
+
+        # layers
+        self.q = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+        self.k = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+        self.v = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+        self.o = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+        self.norm_q = RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity()
+        self.norm_k = RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity()
+
+    def forward(self, x, freqs):
+        r"""
+        Args:
+            x(Tensor): Shape [B, L, num_heads, C / num_heads]
+            freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
+        """
+        b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
+
+        # query, key, value function
+        def qkv_fn(x):
+            q = self.norm_q(self.q(x)).view(b, s, n, d)
+            k = self.norm_k(self.k(x)).view(b, s, n, d)
+            v = self.v(x).view(b, s, n * d)
+            return q, k, v
+
+        q, k, v = qkv_fn(x)
+        q, k = apply_rope(q, k, freqs)
+
+        x = optimized_attention(
+            q.view(b, s, n * d),
+            k.view(b, s, n * d),
+            v,
+            heads=self.num_heads,
+        )
+
+        x = self.o(x)
+        return x
+
+
+class WanT2VCrossAttention(WanSelfAttention):
+
+    def forward(self, x, context):
+        r"""
+        Args:
+            x(Tensor): Shape [B, L1, C]
+            context(Tensor): Shape [B, L2, C]
+        """
+        # compute query, key, value
+        q = self.norm_q(self.q(x))
+        k = self.norm_k(self.k(context))
+        v = self.v(context)
+
+        # compute attention
+        x = optimized_attention(q, k, v, heads=self.num_heads)
+
+        x = self.o(x)
+        return x
+
+
+class WanI2VCrossAttention(WanSelfAttention):
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 window_size=(-1, -1),
+                 qk_norm=True,
+                 eps=1e-6, operation_settings={}):
+        super().__init__(dim, num_heads, window_size, qk_norm, eps, operation_settings=operation_settings)
+
+        self.k_img = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+        self.v_img = operation_settings.get("operations").Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+        # self.alpha = nn.Parameter(torch.zeros((1, )))
+        self.norm_k_img = RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity()
+
+    def forward(self, x, context):
+        r"""
+        Args:
+            x(Tensor): Shape [B, L1, C]
+            context(Tensor): Shape [B, L2, C]
+        """
+        context_img = context[:, :257]
+        context = context[:, 257:]
+
+        # compute query, key, value
+        q = self.norm_q(self.q(x))
+        k = self.norm_k(self.k(context))
+        v = self.v(context)
+        k_img = self.norm_k_img(self.k_img(context_img))
+        v_img = self.v_img(context_img)
+        img_x = optimized_attention(q, k_img, v_img, heads=self.num_heads)
+        # compute attention
+        x = optimized_attention(q, k, v, heads=self.num_heads)
+
+        # output
+        x = x + img_x
+        x = self.o(x)
+        return x
+
+
+WAN_CROSSATTENTION_CLASSES = {
+    't2v_cross_attn': WanT2VCrossAttention,
+    'i2v_cross_attn': WanI2VCrossAttention,
+}
+
+
+class WanAttentionBlock(nn.Module):
+
+    def __init__(self,
+                 cross_attn_type,
+                 dim,
+                 ffn_dim,
+                 num_heads,
+                 window_size=(-1, -1),
+                 qk_norm=True,
+                 cross_attn_norm=False,
+                 eps=1e-6, operation_settings={}):
+        super().__init__()
+        self.dim = dim
+        self.ffn_dim = ffn_dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.qk_norm = qk_norm
+        self.cross_attn_norm = cross_attn_norm
+        self.eps = eps
+
+        # layers
+        self.norm1 = operation_settings.get("operations").LayerNorm(dim, eps, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+        self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm,
+                                          eps, operation_settings=operation_settings)
+        self.norm3 = operation_settings.get("operations").LayerNorm(
+            dim, eps,
+            elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if cross_attn_norm else nn.Identity()
+        self.cross_attn = WAN_CROSSATTENTION_CLASSES[cross_attn_type](dim,
+                                                                      num_heads,
+                                                                      (-1, -1),
+                                                                      qk_norm,
+                                                                      eps, operation_settings=operation_settings)
+        self.norm2 = operation_settings.get("operations").LayerNorm(dim, eps, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+        self.ffn = nn.Sequential(
+            operation_settings.get("operations").Linear(dim, ffn_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")), nn.GELU(approximate='tanh'),
+            operation_settings.get("operations").Linear(ffn_dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
+
+        # modulation
+        self.modulation = nn.Parameter(torch.empty(1, 6, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
+
+    def forward(
+        self,
+        x,
+        e,
+        freqs,
+        context,
+    ):
+        r"""
+        Args:
+            x(Tensor): Shape [B, L, C]
+            e(Tensor): Shape [B, 6, C]
+            freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
+        """
+        # assert e.dtype == torch.float32
+
+        e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1)
+        # assert e[0].dtype == torch.float32
+
+        # self-attention
+        y = self.self_attn(
+            self.norm1(x) * (1 + e[1]) + e[0],
+            freqs)
+
+        x = x + y * e[2]
+
+        # cross-attention & ffn
+        x = x + self.cross_attn(self.norm3(x), context)
+        y = self.ffn(self.norm2(x) * (1 + e[4]) + e[3])
+        x = x + y * e[5]
+        return x
+
+
+class Head(nn.Module):
+
+    def __init__(self, dim, out_dim, patch_size, eps=1e-6, operation_settings={}):
+        super().__init__()
+        self.dim = dim
+        self.out_dim = out_dim
+        self.patch_size = patch_size
+        self.eps = eps
+
+        # layers
+        out_dim = math.prod(patch_size) * out_dim
+        self.norm = operation_settings.get("operations").LayerNorm(dim, eps, elementwise_affine=False, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+        self.head = operation_settings.get("operations").Linear(dim, out_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+
+        # modulation
+        self.modulation = nn.Parameter(torch.empty(1, 2, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
+
+    def forward(self, x, e):
+        r"""
+        Args:
+            x(Tensor): Shape [B, L1, C]
+            e(Tensor): Shape [B, C]
+        """
+        # assert e.dtype == torch.float32
+        e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e.unsqueeze(1)).chunk(2, dim=1)
+        x = (self.head(self.norm(x) * (1 + e[1]) + e[0]))
+        return x
+
+
+class MLPProj(torch.nn.Module):
+
+    def __init__(self, in_dim, out_dim, operation_settings={}):
+        super().__init__()
+
+        self.proj = torch.nn.Sequential(
+            operation_settings.get("operations").LayerNorm(in_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")), operation_settings.get("operations").Linear(in_dim, in_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
+            torch.nn.GELU(), operation_settings.get("operations").Linear(in_dim, out_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
+            operation_settings.get("operations").LayerNorm(out_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
+
+    def forward(self, image_embeds):
+        clip_extra_context_tokens = self.proj(image_embeds)
+        return clip_extra_context_tokens
+
+
+class WanModel(torch.nn.Module):
+    r"""
+    Wan diffusion backbone supporting both text-to-video and image-to-video.
+    """
+
+    def __init__(self,
+                 model_type='t2v',
+                 patch_size=(1, 2, 2),
+                 text_len=512,
+                 in_dim=16,
+                 dim=2048,
+                 ffn_dim=8192,
+                 freq_dim=256,
+                 text_dim=4096,
+                 out_dim=16,
+                 num_heads=16,
+                 num_layers=32,
+                 window_size=(-1, -1),
+                 qk_norm=True,
+                 cross_attn_norm=True,
+                 eps=1e-6,
+                 image_model=None,
+                 device=None,
+                 dtype=None,
+                 operations=None,
+                 ):
+        r"""
+        Initialize the diffusion model backbone.
+
+        Args:
+            model_type (`str`, *optional*, defaults to 't2v'):
+                Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video)
+            patch_size (`tuple`, *optional*, defaults to (1, 2, 2)):
+                3D patch dimensions for video embedding (t_patch, h_patch, w_patch)
+            text_len (`int`, *optional*, defaults to 512):
+                Fixed length for text embeddings
+            in_dim (`int`, *optional*, defaults to 16):
+                Input video channels (C_in)
+            dim (`int`, *optional*, defaults to 2048):
+                Hidden dimension of the transformer
+            ffn_dim (`int`, *optional*, defaults to 8192):
+                Intermediate dimension in feed-forward network
+            freq_dim (`int`, *optional*, defaults to 256):
+                Dimension for sinusoidal time embeddings
+            text_dim (`int`, *optional*, defaults to 4096):
+                Input dimension for text embeddings
+            out_dim (`int`, *optional*, defaults to 16):
+                Output video channels (C_out)
+            num_heads (`int`, *optional*, defaults to 16):
+                Number of attention heads
+            num_layers (`int`, *optional*, defaults to 32):
+                Number of transformer blocks
+            window_size (`tuple`, *optional*, defaults to (-1, -1)):
+                Window size for local attention (-1 indicates global attention)
+            qk_norm (`bool`, *optional*, defaults to True):
+                Enable query/key normalization
+            cross_attn_norm (`bool`, *optional*, defaults to False):
+                Enable cross-attention normalization
+            eps (`float`, *optional*, defaults to 1e-6):
+                Epsilon value for normalization layers
+        """
+
+        super().__init__()
+        self.dtype = dtype
+        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
+
+        assert model_type in ['t2v', 'i2v']
+        self.model_type = model_type
+
+        self.patch_size = patch_size
+        self.text_len = text_len
+        self.in_dim = in_dim
+        self.dim = dim
+        self.ffn_dim = ffn_dim
+        self.freq_dim = freq_dim
+        self.text_dim = text_dim
+        self.out_dim = out_dim
+        self.num_heads = num_heads
+        self.num_layers = num_layers
+        self.window_size = window_size
+        self.qk_norm = qk_norm
+        self.cross_attn_norm = cross_attn_norm
+        self.eps = eps
+
+        # embeddings
+        self.patch_embedding = operations.Conv3d(
+            in_dim, dim, kernel_size=patch_size, stride=patch_size, device=operation_settings.get("device"), dtype=torch.float32)
+        self.text_embedding = nn.Sequential(
+            operations.Linear(text_dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")), nn.GELU(approximate='tanh'),
+            operations.Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
+
+        self.time_embedding = nn.Sequential(
+            operations.Linear(freq_dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")), nn.SiLU(), operations.Linear(dim, dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
+        self.time_projection = nn.Sequential(nn.SiLU(), operations.Linear(dim, dim * 6, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
+
+        # blocks
+        cross_attn_type = 't2v_cross_attn' if model_type == 't2v' else 'i2v_cross_attn'
+        self.blocks = nn.ModuleList([
+            WanAttentionBlock(cross_attn_type, dim, ffn_dim, num_heads,
+                              window_size, qk_norm, cross_attn_norm, eps, operation_settings=operation_settings)
+            for _ in range(num_layers)
+        ])
+
+        # head
+        self.head = Head(dim, out_dim, patch_size, eps, operation_settings=operation_settings)
+
+        d = dim // num_heads
+        self.rope_embedder = EmbedND(dim=d, theta=10000.0, axes_dim=[d - 4 * (d // 6), 2 * (d // 6), 2 * (d // 6)])
+
+        if model_type == 'i2v':
+            self.img_emb = MLPProj(1280, dim, operation_settings=operation_settings)
+        else:
+            self.img_emb = None
+
+    def forward_orig(
+        self,
+        x,
+        t,
+        context,
+        clip_fea=None,
+        freqs=None,
+        transformer_options={},
+    ):
+        r"""
+        Forward pass through the diffusion model
+
+        Args:
+            x (Tensor):
+                List of input video tensors with shape [B, C_in, F, H, W]
+            t (Tensor):
+                Diffusion timesteps tensor of shape [B]
+            context (List[Tensor]):
+                List of text embeddings each with shape [B, L, C]
+            seq_len (`int`):
+                Maximum sequence length for positional encoding
+            clip_fea (Tensor, *optional*):
+                CLIP image features for image-to-video mode
+            y (List[Tensor], *optional*):
+                Conditional video inputs for image-to-video mode, same shape as x
+
+        Returns:
+            List[Tensor]:
+                List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
+        """
+        # embeddings
+        x = self.patch_embedding(x.float()).to(x.dtype)
+        grid_sizes = x.shape[2:]
+        x = x.flatten(2).transpose(1, 2)
+
+        # time embeddings
+        e = self.time_embedding(
+            sinusoidal_embedding_1d(self.freq_dim, t).to(dtype=x[0].dtype))
+        e0 = self.time_projection(e).unflatten(1, (6, self.dim))
+
+        # context
+        context = self.text_embedding(context)
+
+        if clip_fea is not None and self.img_emb is not None:
+            context_clip = self.img_emb(clip_fea)  # bs x 257 x dim
+            context = torch.concat([context_clip, context], dim=1)
+
+        patches_replace = transformer_options.get("patches_replace", {})
+        blocks_replace = patches_replace.get("dit", {})
+        for i, block in enumerate(self.blocks):
+            if ("double_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"])
+                    return out
+                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap})
+                x = out["img"]
+            else:
+                x = block(x, e=e0, freqs=freqs, context=context)
+
+        # head
+        x = self.head(x, e)
+
+        # unpatchify
+        x = self.unpatchify(x, grid_sizes)
+        return x
+
+    def forward(self, x, timestep, context, clip_fea=None, transformer_options={},**kwargs):
+        bs, c, t, h, w = x.shape
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)
+        patch_size = self.patch_size
+        t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
+        h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
+        w_len = ((w + (patch_size[2] // 2)) // patch_size[2])
+        img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device, dtype=x.dtype)
+        img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1)
+        img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1)
+        img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).reshape(1, 1, -1)
+        img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs)
+
+        freqs = self.rope_embedder(img_ids).movedim(1, 2)
+        return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs, transformer_options=transformer_options)[:, :, :t, :h, :w]
+
+    def unpatchify(self, x, grid_sizes):
+        r"""
+        Reconstruct video tensors from patch embeddings.
+
+        Args:
+            x (List[Tensor]):
+                List of patchified features, each with shape [L, C_out * prod(patch_size)]
+            grid_sizes (Tensor):
+                Original spatial-temporal grid dimensions before patching,
+                    shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)
+
+        Returns:
+            List[Tensor]:
+                Reconstructed video tensors with shape [L, C_out, F, H / 8, W / 8]
+        """
+
+        c = self.out_dim
+        u = x
+        b = u.shape[0]
+        u = u[:, :math.prod(grid_sizes)].view(b, *grid_sizes, *self.patch_size, c)
+        u = torch.einsum('bfhwpqrc->bcfphqwr', u)
+        u = u.reshape(b, c, *[i * j for i, j in zip(grid_sizes, self.patch_size)])
+        return u
--- a/comfy/ldm/wan/vae.py
+++ b/comfy/ldm/wan/vae.py
@ -0,0 +1,567 @@
+# original version: https://github.com/Wan-Video/Wan2.1/blob/main/wan/modules/vae.py
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from comfy.ldm.modules.diffusionmodules.model import vae_attention
+
+import comfy.ops
+ops = comfy.ops.disable_weight_init
+
+CACHE_T = 2
+
+
+class CausalConv3d(ops.Conv3d):
+    """
+    Causal 3d convolusion.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._padding = (self.padding[2], self.padding[2], self.padding[1],
+                         self.padding[1], 2 * self.padding[0], 0)
+        self.padding = (0, 0, 0)
+
+    def forward(self, x, cache_x=None):
+        padding = list(self._padding)
+        if cache_x is not None and self._padding[4] > 0:
+            cache_x = cache_x.to(x.device)
+            x = torch.cat([cache_x, x], dim=2)
+            padding[4] -= cache_x.shape[2]
+        x = F.pad(x, padding)
+
+        return super().forward(x)
+
+
+class RMS_norm(nn.Module):
+
+    def __init__(self, dim, channel_first=True, images=True, bias=False):
+        super().__init__()
+        broadcastable_dims = (1, 1, 1) if not images else (1, 1)
+        shape = (dim, *broadcastable_dims) if channel_first else (dim,)
+
+        self.channel_first = channel_first
+        self.scale = dim**0.5
+        self.gamma = nn.Parameter(torch.ones(shape))
+        self.bias = nn.Parameter(torch.zeros(shape)) if bias else None
+
+    def forward(self, x):
+        return F.normalize(
+            x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma.to(x) + (self.bias.to(x) if self.bias is not None else 0)
+
+
+class Upsample(nn.Upsample):
+
+    def forward(self, x):
+        """
+        Fix bfloat16 support for nearest neighbor interpolation.
+        """
+        return super().forward(x.float()).type_as(x)
+
+
+class Resample(nn.Module):
+
+    def __init__(self, dim, mode):
+        assert mode in ('none', 'upsample2d', 'upsample3d', 'downsample2d',
+                        'downsample3d')
+        super().__init__()
+        self.dim = dim
+        self.mode = mode
+
+        # layers
+        if mode == 'upsample2d':
+            self.resample = nn.Sequential(
+                Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
+                ops.Conv2d(dim, dim // 2, 3, padding=1))
+        elif mode == 'upsample3d':
+            self.resample = nn.Sequential(
+                Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
+                ops.Conv2d(dim, dim // 2, 3, padding=1))
+            self.time_conv = CausalConv3d(
+                dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
+
+        elif mode == 'downsample2d':
+            self.resample = nn.Sequential(
+                nn.ZeroPad2d((0, 1, 0, 1)),
+                ops.Conv2d(dim, dim, 3, stride=(2, 2)))
+        elif mode == 'downsample3d':
+            self.resample = nn.Sequential(
+                nn.ZeroPad2d((0, 1, 0, 1)),
+                ops.Conv2d(dim, dim, 3, stride=(2, 2)))
+            self.time_conv = CausalConv3d(
+                dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
+
+        else:
+            self.resample = nn.Identity()
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):
+        b, c, t, h, w = x.size()
+        if self.mode == 'upsample3d':
+            if feat_cache is not None:
+                idx = feat_idx[0]
+                if feat_cache[idx] is None:
+                    feat_cache[idx] = 'Rep'
+                    feat_idx[0] += 1
+                else:
+
+                    cache_x = x[:, :, -CACHE_T:, :, :].clone()
+                    if cache_x.shape[2] < 2 and feat_cache[
+                            idx] is not None and feat_cache[idx] != 'Rep':
+                        # cache last frame of last two chunk
+                        cache_x = torch.cat([
+                            feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
+                                cache_x.device), cache_x
+                        ],
+                                            dim=2)
+                    if cache_x.shape[2] < 2 and feat_cache[
+                            idx] is not None and feat_cache[idx] == 'Rep':
+                        cache_x = torch.cat([
+                            torch.zeros_like(cache_x).to(cache_x.device),
+                            cache_x
+                        ],
+                                            dim=2)
+                    if feat_cache[idx] == 'Rep':
+                        x = self.time_conv(x)
+                    else:
+                        x = self.time_conv(x, feat_cache[idx])
+                    feat_cache[idx] = cache_x
+                    feat_idx[0] += 1
+
+                    x = x.reshape(b, 2, c, t, h, w)
+                    x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
+                                    3)
+                    x = x.reshape(b, c, t * 2, h, w)
+        t = x.shape[2]
+        x = rearrange(x, 'b c t h w -> (b t) c h w')
+        x = self.resample(x)
+        x = rearrange(x, '(b t) c h w -> b c t h w', t=t)
+
+        if self.mode == 'downsample3d':
+            if feat_cache is not None:
+                idx = feat_idx[0]
+                if feat_cache[idx] is None:
+                    feat_cache[idx] = x.clone()
+                    feat_idx[0] += 1
+                else:
+
+                    cache_x = x[:, :, -1:, :, :].clone()
+                    # if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx]!='Rep':
+                    #     # cache last frame of last two chunk
+                    #     cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+
+                    x = self.time_conv(
+                        torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
+                    feat_cache[idx] = cache_x
+                    feat_idx[0] += 1
+        return x
+
+    def init_weight(self, conv):
+        conv_weight = conv.weight
+        nn.init.zeros_(conv_weight)
+        c1, c2, t, h, w = conv_weight.size()
+        one_matrix = torch.eye(c1, c2)
+        init_matrix = one_matrix
+        nn.init.zeros_(conv_weight)
+        #conv_weight.data[:,:,-1,1,1] = init_matrix * 0.5
+        conv_weight.data[:, :, 1, 0, 0] = init_matrix  #* 0.5
+        conv.weight.data.copy_(conv_weight)
+        nn.init.zeros_(conv.bias.data)
+
+    def init_weight2(self, conv):
+        conv_weight = conv.weight.data
+        nn.init.zeros_(conv_weight)
+        c1, c2, t, h, w = conv_weight.size()
+        init_matrix = torch.eye(c1 // 2, c2)
+        #init_matrix = repeat(init_matrix, 'o ... -> (o 2) ...').permute(1,0,2).contiguous().reshape(c1,c2)
+        conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
+        conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
+        conv.weight.data.copy_(conv_weight)
+        nn.init.zeros_(conv.bias.data)
+
+
+class ResidualBlock(nn.Module):
+
+    def __init__(self, in_dim, out_dim, dropout=0.0):
+        super().__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+
+        # layers
+        self.residual = nn.Sequential(
+            RMS_norm(in_dim, images=False), nn.SiLU(),
+            CausalConv3d(in_dim, out_dim, 3, padding=1),
+            RMS_norm(out_dim, images=False), nn.SiLU(), nn.Dropout(dropout),
+            CausalConv3d(out_dim, out_dim, 3, padding=1))
+        self.shortcut = CausalConv3d(in_dim, out_dim, 1) \
+            if in_dim != out_dim else nn.Identity()
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):
+        h = self.shortcut(x)
+        for layer in self.residual:
+            if isinstance(layer, CausalConv3d) and feat_cache is not None:
+                idx = feat_idx[0]
+                cache_x = x[:, :, -CACHE_T:, :, :].clone()
+                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                    # cache last frame of last two chunk
+                    cache_x = torch.cat([
+                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
+                            cache_x.device), cache_x
+                    ],
+                                        dim=2)
+                x = layer(x, feat_cache[idx])
+                feat_cache[idx] = cache_x
+                feat_idx[0] += 1
+            else:
+                x = layer(x)
+        return x + h
+
+
+class AttentionBlock(nn.Module):
+    """
+    Causal self-attention with a single head.
+    """
+
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+        # layers
+        self.norm = RMS_norm(dim)
+        self.to_qkv = ops.Conv2d(dim, dim * 3, 1)
+        self.proj = ops.Conv2d(dim, dim, 1)
+        self.optimized_attention = vae_attention()
+
+    def forward(self, x):
+        identity = x
+        b, c, t, h, w = x.size()
+        x = rearrange(x, 'b c t h w -> (b t) c h w')
+        x = self.norm(x)
+        # compute query, key, value
+
+        q, k, v = self.to_qkv(x).chunk(3, dim=1)
+        x = self.optimized_attention(q, k, v)
+
+        # output
+        x = self.proj(x)
+        x = rearrange(x, '(b t) c h w-> b c t h w', t=t)
+        return x + identity
+
+
+class Encoder3d(nn.Module):
+
+    def __init__(self,
+                 dim=128,
+                 z_dim=4,
+                 dim_mult=[1, 2, 4, 4],
+                 num_res_blocks=2,
+                 attn_scales=[],
+                 temperal_downsample=[True, True, False],
+                 dropout=0.0):
+        super().__init__()
+        self.dim = dim
+        self.z_dim = z_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.temperal_downsample = temperal_downsample
+
+        # dimensions
+        dims = [dim * u for u in [1] + dim_mult]
+        scale = 1.0
+
+        # init block
+        self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)
+
+        # downsample blocks
+        downsamples = []
+        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
+            # residual (+attention) blocks
+            for _ in range(num_res_blocks):
+                downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
+                if scale in attn_scales:
+                    downsamples.append(AttentionBlock(out_dim))
+                in_dim = out_dim
+
+            # downsample block
+            if i != len(dim_mult) - 1:
+                mode = 'downsample3d' if temperal_downsample[
+                    i] else 'downsample2d'
+                downsamples.append(Resample(out_dim, mode=mode))
+                scale /= 2.0
+        self.downsamples = nn.Sequential(*downsamples)
+
+        # middle blocks
+        self.middle = nn.Sequential(
+            ResidualBlock(out_dim, out_dim, dropout), AttentionBlock(out_dim),
+            ResidualBlock(out_dim, out_dim, dropout))
+
+        # output blocks
+        self.head = nn.Sequential(
+            RMS_norm(out_dim, images=False), nn.SiLU(),
+            CausalConv3d(out_dim, z_dim, 3, padding=1))
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, -CACHE_T:, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                # cache last frame of last two chunk
+                cache_x = torch.cat([
+                    feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
+                        cache_x.device), cache_x
+                ],
+                                    dim=2)
+            x = self.conv1(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv1(x)
+
+        ## downsamples
+        for layer in self.downsamples:
+            if feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## middle
+        for layer in self.middle:
+            if isinstance(layer, ResidualBlock) and feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## head
+        for layer in self.head:
+            if isinstance(layer, CausalConv3d) and feat_cache is not None:
+                idx = feat_idx[0]
+                cache_x = x[:, :, -CACHE_T:, :, :].clone()
+                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                    # cache last frame of last two chunk
+                    cache_x = torch.cat([
+                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
+                            cache_x.device), cache_x
+                    ],
+                                        dim=2)
+                x = layer(x, feat_cache[idx])
+                feat_cache[idx] = cache_x
+                feat_idx[0] += 1
+            else:
+                x = layer(x)
+        return x
+
+
+class Decoder3d(nn.Module):
+
+    def __init__(self,
+                 dim=128,
+                 z_dim=4,
+                 dim_mult=[1, 2, 4, 4],
+                 num_res_blocks=2,
+                 attn_scales=[],
+                 temperal_upsample=[False, True, True],
+                 dropout=0.0):
+        super().__init__()
+        self.dim = dim
+        self.z_dim = z_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.temperal_upsample = temperal_upsample
+
+        # dimensions
+        dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
+        scale = 1.0 / 2**(len(dim_mult) - 2)
+
+        # init block
+        self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
+
+        # middle blocks
+        self.middle = nn.Sequential(
+            ResidualBlock(dims[0], dims[0], dropout), AttentionBlock(dims[0]),
+            ResidualBlock(dims[0], dims[0], dropout))
+
+        # upsample blocks
+        upsamples = []
+        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
+            # residual (+attention) blocks
+            if i == 1 or i == 2 or i == 3:
+                in_dim = in_dim // 2
+            for _ in range(num_res_blocks + 1):
+                upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
+                if scale in attn_scales:
+                    upsamples.append(AttentionBlock(out_dim))
+                in_dim = out_dim
+
+            # upsample block
+            if i != len(dim_mult) - 1:
+                mode = 'upsample3d' if temperal_upsample[i] else 'upsample2d'
+                upsamples.append(Resample(out_dim, mode=mode))
+                scale *= 2.0
+        self.upsamples = nn.Sequential(*upsamples)
+
+        # output blocks
+        self.head = nn.Sequential(
+            RMS_norm(out_dim, images=False), nn.SiLU(),
+            CausalConv3d(out_dim, 3, 3, padding=1))
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):
+        ## conv1
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, -CACHE_T:, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                # cache last frame of last two chunk
+                cache_x = torch.cat([
+                    feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
+                        cache_x.device), cache_x
+                ],
+                                    dim=2)
+            x = self.conv1(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv1(x)
+
+        ## middle
+        for layer in self.middle:
+            if isinstance(layer, ResidualBlock) and feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## upsamples
+        for layer in self.upsamples:
+            if feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## head
+        for layer in self.head:
+            if isinstance(layer, CausalConv3d) and feat_cache is not None:
+                idx = feat_idx[0]
+                cache_x = x[:, :, -CACHE_T:, :, :].clone()
+                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                    # cache last frame of last two chunk
+                    cache_x = torch.cat([
+                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
+                            cache_x.device), cache_x
+                    ],
+                                        dim=2)
+                x = layer(x, feat_cache[idx])
+                feat_cache[idx] = cache_x
+                feat_idx[0] += 1
+            else:
+                x = layer(x)
+        return x
+
+
+def count_conv3d(model):
+    count = 0
+    for m in model.modules():
+        if isinstance(m, CausalConv3d):
+            count += 1
+    return count
+
+
+class WanVAE(nn.Module):
+
+    def __init__(self,
+                 dim=128,
+                 z_dim=4,
+                 dim_mult=[1, 2, 4, 4],
+                 num_res_blocks=2,
+                 attn_scales=[],
+                 temperal_downsample=[True, True, False],
+                 dropout=0.0):
+        super().__init__()
+        self.dim = dim
+        self.z_dim = z_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.temperal_downsample = temperal_downsample
+        self.temperal_upsample = temperal_downsample[::-1]
+
+        # modules
+        self.encoder = Encoder3d(dim, z_dim * 2, dim_mult, num_res_blocks,
+                                 attn_scales, self.temperal_downsample, dropout)
+        self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
+        self.conv2 = CausalConv3d(z_dim, z_dim, 1)
+        self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks,
+                                 attn_scales, self.temperal_upsample, dropout)
+
+    def forward(self, x):
+        mu, log_var = self.encode(x)
+        z = self.reparameterize(mu, log_var)
+        x_recon = self.decode(z)
+        return x_recon, mu, log_var
+
+    def encode(self, x):
+        self.clear_cache()
+        ## cache
+        t = x.shape[2]
+        iter_ = 1 + (t - 1) // 4
+        ## 对encode输入的x，按时间拆分为1、4、4、4....
+        for i in range(iter_):
+            self._enc_conv_idx = [0]
+            if i == 0:
+                out = self.encoder(
+                    x[:, :, :1, :, :],
+                    feat_cache=self._enc_feat_map,
+                    feat_idx=self._enc_conv_idx)
+            else:
+                out_ = self.encoder(
+                    x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
+                    feat_cache=self._enc_feat_map,
+                    feat_idx=self._enc_conv_idx)
+                out = torch.cat([out, out_], 2)
+        mu, log_var = self.conv1(out).chunk(2, dim=1)
+        self.clear_cache()
+        return mu
+
+    def decode(self, z):
+        self.clear_cache()
+        # z: [b,c,t,h,w]
+
+        iter_ = z.shape[2]
+        x = self.conv2(z)
+        for i in range(iter_):
+            self._conv_idx = [0]
+            if i == 0:
+                out = self.decoder(
+                    x[:, :, i:i + 1, :, :],
+                    feat_cache=self._feat_map,
+                    feat_idx=self._conv_idx)
+            else:
+                out_ = self.decoder(
+                    x[:, :, i:i + 1, :, :],
+                    feat_cache=self._feat_map,
+                    feat_idx=self._conv_idx)
+                out = torch.cat([out, out_], 2)
+        self.clear_cache()
+        return out
+
+    def reparameterize(self, mu, log_var):
+        std = torch.exp(0.5 * log_var)
+        eps = torch.randn_like(std)
+        return eps * std + mu
+
+    def sample(self, imgs, deterministic=False):
+        mu, log_var = self.encode(imgs)
+        if deterministic:
+            return mu
+        std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
+        return mu + std * torch.randn_like(std)
+
+    def clear_cache(self):
+        self._conv_num = count_conv3d(self.decoder)
+        self._conv_idx = [0]
+        self._feat_map = [None] * self._conv_num
+        #cache encode
+        self._enc_conv_num = count_conv3d(self.encoder)
+        self._enc_conv_idx = [0]
+        self._enc_feat_map = [None] * self._enc_conv_num
--- a/comfy/lora.py
+++ b/comfy/lora.py
@ -307,7 +307,6 @@ def model_lora_keys_unet(model, key_map={}):
            if k.endswith(".weight"):
                key_lora = k[len("diffusion_model."):-len(".weight")].replace(".", "_")
                key_map["lora_unet_{}".format(key_lora)] = k
-                key_map["lora_prior_unet_{}".format(key_lora)] = k #cascade lora: TODO put lora key prefix in the model config
                key_map["{}".format(k[:-len(".weight")])] = k #generic lora format without any weird key names
            else:
                key_map["{}".format(k)] = k #generic lora format for not .weight without any weird key names
@ -327,6 +326,13 @@ def model_lora_keys_unet(model, key_map={}):
                    diffusers_lora_key = diffusers_lora_key[:-2]
                key_map[diffusers_lora_key] = unet_key

+    if isinstance(model, comfy.model_base.StableCascade_C):
+        for k in sdk:
+            if k.startswith("diffusion_model."):
+                if k.endswith(".weight"):
+                    key_lora = k[len("diffusion_model."):-len(".weight")].replace(".", "_")
+                    key_map["lora_prior_unet_{}".format(key_lora)] = k
+
    if isinstance(model, comfy.model_base.SD3): #Diffusers lora SD3
        diffusers_keys = comfy.utils.mmdit_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.")
        for k in diffusers_keys:
@ -344,7 +350,6 @@ def model_lora_keys_unet(model, key_map={}):
                key_lora = "lycoris_{}".format(k[:-len(".weight")].replace(".", "_")) #simpletuner lycoris format
                key_map[key_lora] = to

-
    if isinstance(model, comfy.model_base.AuraFlow): #Diffusers lora AuraFlow
        diffusers_keys = comfy.utils.auraflow_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.")
        for k in diffusers_keys:
@ -353,6 +358,20 @@ def model_lora_keys_unet(model, key_map={}):
                key_lora = "transformer.{}".format(k[:-len(".weight")]) #simpletrainer and probably regular diffusers lora format
                key_map[key_lora] = to

+    if isinstance(model, comfy.model_base.PixArt):
+        diffusers_keys = comfy.utils.pixart_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.")
+        for k in diffusers_keys:
+            if k.endswith(".weight"):
+                to = diffusers_keys[k]
+                key_lora = "transformer.{}".format(k[:-len(".weight")]) #default format
+                key_map[key_lora] = to
+
+                key_lora = "base_model.model.{}".format(k[:-len(".weight")]) #diffusers training script
+                key_map[key_lora] = to
+
+                key_lora = "unet.base_model.model.{}".format(k[:-len(".weight")]) #old reference peft script
+                key_map[key_lora] = to
+
    if isinstance(model, comfy.model_base.HunyuanDiT):
        for k in sdk:
            if k.startswith("diffusion_model.") and k.endswith(".weight"):
@ -374,6 +393,18 @@ def model_lora_keys_unet(model, key_map={}):
                key_lora = k[len("diffusion_model."):-len(".weight")]
                key_map["{}".format(key_lora)] = k

+    if isinstance(model, comfy.model_base.HunyuanVideo):
+        for k in sdk:
+            if k.startswith("diffusion_model.") and k.endswith(".weight"):
+                # diffusion-pipe lora format
+                key_lora = k
+                key_lora = key_lora.replace("_mod.lin.", "_mod.linear.").replace("_attn.qkv.", "_attn_qkv.").replace("_attn.proj.", "_attn_proj.")
+                key_lora = key_lora.replace("mlp.0.", "mlp.fc1.").replace("mlp.2.", "mlp.fc2.")
+                key_lora = key_lora.replace(".modulation.lin.", ".modulation.linear.")
+                key_lora = key_lora[len("diffusion_model."):-len(".weight")]
+                key_map["transformer.{}".format(key_lora)] = k
+                key_map["diffusion_model.{}".format(key_lora)] = k  # Old loras
+
    return key_map


--- a/comfy/lora_convert.py
+++ b/comfy/lora_convert.py
@ -1,4 +1,5 @@
 import torch
+import comfy.utils


 def convert_lora_bfl_control(sd): #BFL loras for Flux
@ -11,7 +12,13 @@ def convert_lora_bfl_control(sd): #BFL loras for Flux
    return sd_out


+def convert_lora_wan_fun(sd): #Wan Fun loras
+    return comfy.utils.state_dict_prefix_replace(sd, {"lora_unet__": "lora_unet_"})
+
+
 def convert_lora(sd):
    if "img_in.lora_A.weight" in sd and "single_blocks.0.norm.key_norm.scale" in sd:
        return convert_lora_bfl_control(sd)
+    if "lora_unet__blocks_0_cross_attn_k.lora_down.weight" in sd:
+        return convert_lora_wan_fun(sd)
    return sd
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -26,11 +26,18 @@ from comfy.ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAug
 from comfy.ldm.modules.diffusionmodules.mmdit import OpenAISignatureMMDITWrapper
 import comfy.ldm.genmo.joint_model.asymm_models_joint
 import comfy.ldm.aura.mmdit
+import comfy.ldm.pixart.pixartms
 import comfy.ldm.hydit.models
 import comfy.ldm.audio.dit
 import comfy.ldm.audio.embedders
 import comfy.ldm.flux.model
 import comfy.ldm.lightricks.model
+import comfy.ldm.hunyuan_video.model
+import comfy.ldm.cosmos.model
+import comfy.ldm.lumina.model
+import comfy.ldm.wan.model
+import comfy.ldm.hunyuan3d.model
+import comfy.ldm.hidream.model

 import comfy.model_management
 import comfy.patcher_extension
@ -53,6 +60,7 @@ class ModelType(Enum):
    FLOW = 6
    V_PREDICTION_CONTINUOUS = 7
    FLUX = 8
+    IMG_TO_IMG = 9


 from comfy.model_sampling import EPS, V_PREDICTION, EDM, ModelSamplingDiscrete, ModelSamplingContinuousEDM, StableCascadeSampling, ModelSamplingContinuousV
@ -83,6 +91,8 @@ def model_sampling(model_config, model_type):
    elif model_type == ModelType.FLUX:
        c = comfy.model_sampling.CONST
        s = comfy.model_sampling.ModelSamplingFlux
+    elif model_type == ModelType.IMG_TO_IMG:
+        c = comfy.model_sampling.IMG_TO_IMG

    class ModelSampling(s, c):
        pass
@ -103,7 +113,7 @@ class BaseModel(torch.nn.Module):

        if not unet_config.get("disable_unet_model_creation", False):
            if model_config.custom_operations is None:
-                fp8 = model_config.optimizations.get("fp8", model_config.scaled_fp8 is not None)
+                fp8 = model_config.optimizations.get("fp8", False)
                operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8)
            else:
                operations = model_config.custom_operations
@ -134,6 +144,7 @@ class BaseModel(torch.nn.Module):
    def _apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
        sigma = t
        xc = self.model_sampling.calculate_input(sigma, x)
+
        if c_concat is not None:
            xc = torch.cat([xc] + [c_concat], dim=1)

@ -145,7 +156,9 @@ class BaseModel(torch.nn.Module):

        xc = xc.to(dtype)
        t = self.model_sampling.timestep(t).float()
+        if context is not None:
            context = context.to(dtype)
+
        extra_conds = {}
        for o in kwargs:
            extra = kwargs[o]
@ -154,15 +167,16 @@ class BaseModel(torch.nn.Module):
                    extra = extra.to(dtype)
            extra_conds[o] = extra

+        t = self.process_timestep(t, x=x, **extra_conds)
        model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds).float()
        return self.model_sampling.calculate_denoised(sigma, model_output, x)

+    def process_timestep(self, timestep, **kwargs):
+        return timestep
+
    def get_dtype(self):
        return self.diffusion_model.dtype

-    def is_adm(self):
-        return self.adm_channels > 0
-
    def encode_adm(self, **kwargs):
        return None

@ -181,14 +195,20 @@ class BaseModel(torch.nn.Module):

            if concat_latent_image.shape[1:] != noise.shape[1:]:
                concat_latent_image = utils.common_upscale(concat_latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")
+                if noise.ndim == 5:
+                    if concat_latent_image.shape[-3] < noise.shape[-3]:
+                        concat_latent_image = torch.nn.functional.pad(concat_latent_image, (0, 0, 0, 0, 0, noise.shape[-3] - concat_latent_image.shape[-3]), "constant", 0)
+                    else:
+                        concat_latent_image = concat_latent_image[:, :, :noise.shape[-3]]

            concat_latent_image = utils.resize_to_batch_size(concat_latent_image, noise.shape[0])

            if denoise_mask is not None:
                if len(denoise_mask.shape) == len(noise.shape):
-                    denoise_mask = denoise_mask[:,:1]
+                    denoise_mask = denoise_mask[:, :1]

-                denoise_mask = denoise_mask.reshape((-1, 1, denoise_mask.shape[-2], denoise_mask.shape[-1]))
+                num_dim = noise.ndim - 2
+                denoise_mask = denoise_mask.reshape((-1, 1) + tuple(denoise_mask.shape[-num_dim:]))
                if denoise_mask.shape[-2:] != noise.shape[-2:]:
                    denoise_mask = utils.common_upscale(denoise_mask, noise.shape[-1], noise.shape[-2], "bilinear", "center")
                denoise_mask = utils.resize_to_batch_size(denoise_mask.round(), noise.shape[0])
@ -198,12 +218,21 @@ class BaseModel(torch.nn.Module):
                    if ck == "mask":
                        cond_concat.append(denoise_mask.to(device))
                    elif ck == "masked_image":
-                        cond_concat.append(concat_latent_image.to(device)) #NOTE: the latent_image should be masked by the mask in pixel space
+                        cond_concat.append(concat_latent_image.to(device))  # NOTE: the latent_image should be masked by the mask in pixel space
+                    elif ck == "mask_inverted":
+                        cond_concat.append(1.0 - denoise_mask.to(device))
                else:
                    if ck == "mask":
-                        cond_concat.append(torch.ones_like(noise)[:,:1])
+                        cond_concat.append(torch.ones_like(noise)[:, :1])
                    elif ck == "masked_image":
                        cond_concat.append(self.blank_inpaint_image_like(noise))
+                    elif ck == "mask_inverted":
+                        cond_concat.append(torch.zeros_like(noise)[:, :1])
+                if ck == "concat_image":
+                    if concat_latent_image is not None:
+                        cond_concat.append(concat_latent_image.to(device))
+                    else:
+                        cond_concat.append(torch.zeros_like(noise))
            data = torch.cat(cond_concat, dim=1)
            return data
        return None
@ -291,6 +320,9 @@ class BaseModel(torch.nn.Module):
            return blank_image
        self.blank_inpaint_image_like = blank_inpaint_image_like

+    def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs):
+        return self.model_sampling.noise_scaling(sigma.reshape([sigma.shape[0]] + [1] * (len(noise.shape) - 1)), noise, latent_image)
+
    def memory_required(self, input_shape):
        if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention():
            dtype = self.get_dtype()
@ -427,7 +459,6 @@ class SVD_img2vid(BaseModel):

        latent_image = kwargs.get("concat_latent_image", None)
        noise = kwargs.get("noise", None)
-        device = kwargs["device"]

        if latent_image is None:
            latent_image = torch.zeros_like(noise)
@ -539,6 +570,10 @@ class SD_X4Upscaler(BaseModel):

        out['c_concat'] = comfy.conds.CONDNoiseShape(image)
        out['y'] = comfy.conds.CONDRegular(noise_level)
+
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn)
        return out

 class IP2P:
@ -571,6 +606,19 @@ class SDXL_instructpix2pix(IP2P, SDXL):
        else:
            self.process_ip2p_image_in = lambda image: image #diffusers ip2p

+class Lotus(BaseModel):
+    def extra_conds(self, **kwargs):
+        out = {}
+        cross_attn = kwargs.get("cross_attn", None)
+        out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn)
+        device = kwargs["device"]
+        task_emb = torch.tensor([1, 0]).float().to(device)
+        task_emb = torch.cat([torch.sin(task_emb), torch.cos(task_emb)]).unsqueeze(0)
+        out['y'] = comfy.conds.CONDRegular(task_emb)
+        return out
+
+    def __init__(self, model_config, model_type=ModelType.IMG_TO_IMG, device=None):
+        super().__init__(model_config, model_type, device=device)

 class StableCascade_C(BaseModel):
    def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
@ -687,6 +735,7 @@ class StableAudio1(BaseModel):
                sd["{}{}".format(k, l)] = s[l]
        return sd

+
 class HunyuanDiT(BaseModel):
    def __init__(self, model_config, model_type=ModelType.V_PREDICTION, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hydit.models.HunYuanDiT)
@ -711,14 +760,31 @@ class HunyuanDiT(BaseModel):

        width = kwargs.get("width", 768)
        height = kwargs.get("height", 768)
-        crop_w = kwargs.get("crop_w", 0)
-        crop_h = kwargs.get("crop_h", 0)
        target_width = kwargs.get("target_width", width)
        target_height = kwargs.get("target_height", height)

        out['image_meta_size'] = comfy.conds.CONDRegular(torch.FloatTensor([[height, width, target_height, target_width, 0, 0]]))
        return out

+class PixArt(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.pixart.pixartms.PixArtMS)
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+
+        width = kwargs.get("width", None)
+        height = kwargs.get("height", None)
+        if width is not None and height is not None:
+            out["c_size"] = comfy.conds.CONDRegular(torch.FloatTensor([[height, width]]))
+            out["c_ar"] = comfy.conds.CONDRegular(torch.FloatTensor([[kwargs.get("aspect_ratio", height/width)]]))
+
+        return out
+
 class Flux(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.flux.model.Flux)
@ -755,7 +821,6 @@ class Flux(BaseModel):
            mask = torch.ones_like(noise)[:, :1]

        mask = torch.mean(mask, dim=1, keepdim=True)
-        print(mask.shape)
        mask = utils.common_upscale(mask.to(device), noise.shape[-1] * 8, noise.shape[-2] * 8, "bilinear", "center")
        mask = mask.view(mask.shape[0], mask.shape[2] // 8, 8, mask.shape[3] // 8, 8).permute(0, 2, 4, 1, 3).reshape(mask.shape[0], -1, mask.shape[2] // 8, mask.shape[3] // 8)
        mask = utils.resize_to_batch_size(mask, noise.shape[0])
@ -769,7 +834,20 @@ class Flux(BaseModel):
        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
-        out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([kwargs.get("guidance", 3.5)]))
+        # upscale the attention mask, since now we
+        attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            shape = kwargs["noise"].shape
+            mask_ref_size = kwargs["attention_mask_img_shape"]
+            # the model will pad to the patch size, and then divide
+            # essentially dividing and rounding up
+            (h_tok, w_tok) = (math.ceil(shape[2] / self.diffusion_model.patch_size), math.ceil(shape[3] / self.diffusion_model.patch_size))
+            attention_mask = utils.upscale_dit_mask(attention_mask, mask_ref_size, (h_tok, w_tok))
+            out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
+
+        guidance = kwargs.get("guidance", 3.5)
+        if guidance is not None:
+            out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
        return out

 class GenmoMochi(BaseModel):
@ -800,9 +878,199 @@ class LTXV(BaseModel):
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)

-        guiding_latent = kwargs.get("guiding_latent", None)
-        if guiding_latent is not None:
-            out['guiding_latent'] = comfy.conds.CONDRegular(guiding_latent)
-
        out['frame_rate'] = comfy.conds.CONDConstant(kwargs.get("frame_rate", 25))
+
+        denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
+        if denoise_mask is not None:
+            out["denoise_mask"] = comfy.conds.CONDRegular(denoise_mask)
+
+        keyframe_idxs = kwargs.get("keyframe_idxs", None)
+        if keyframe_idxs is not None:
+            out['keyframe_idxs'] = comfy.conds.CONDRegular(keyframe_idxs)
+
+        return out
+
+    def process_timestep(self, timestep, x, denoise_mask=None, **kwargs):
+        if denoise_mask is None:
+            return timestep
+        return self.diffusion_model.patchifier.patchify(((denoise_mask) * timestep.view([timestep.shape[0]] + [1] * (denoise_mask.ndim - 1)))[:, :1])[0]
+
+    def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs):
+        return latent_image
+
+class HunyuanVideo(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan_video.model.HunyuanVideo)
+
+    def encode_adm(self, **kwargs):
+        return kwargs["pooled_output"]
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+
+        guidance = kwargs.get("guidance", 6.0)
+        if guidance is not None:
+            out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
+
+        guiding_frame_index = kwargs.get("guiding_frame_index", None)
+        if guiding_frame_index is not None:
+            out['guiding_frame_index'] = comfy.conds.CONDRegular(torch.FloatTensor([guiding_frame_index]))
+
+        return out
+
+    def scale_latent_inpaint(self, latent_image, **kwargs):
+        return latent_image
+
+class HunyuanVideoI2V(HunyuanVideo):
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device)
+        self.concat_keys = ("concat_image", "mask_inverted")
+
+    def scale_latent_inpaint(self, latent_image, **kwargs):
+        return super().scale_latent_inpaint(latent_image=latent_image, **kwargs)
+
+class HunyuanVideoSkyreelsI2V(HunyuanVideo):
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device)
+        self.concat_keys = ("concat_image",)
+
+    def scale_latent_inpaint(self, latent_image, **kwargs):
+        return super().scale_latent_inpaint(latent_image=latent_image, **kwargs)
+
+class CosmosVideo(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.EDM, image_to_video=False, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.cosmos.model.GeneralDIT)
+        self.image_to_video = image_to_video
+        if self.image_to_video:
+            self.concat_keys = ("mask_inverted",)
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+
+        out['fps'] = comfy.conds.CONDConstant(kwargs.get("frame_rate", None))
+        return out
+
+    def scale_latent_inpaint(self, sigma, noise, latent_image, **kwargs):
+        sigma = sigma.reshape([sigma.shape[0]] + [1] * (len(noise.shape) - 1))
+        sigma_noise_augmentation = 0 #TODO
+        if sigma_noise_augmentation != 0:
+            latent_image = latent_image + noise
+        latent_image = self.model_sampling.calculate_input(torch.tensor([sigma_noise_augmentation], device=latent_image.device, dtype=latent_image.dtype), latent_image)
+        return latent_image * ((sigma ** 2 + self.model_sampling.sigma_data ** 2) ** 0.5)
+
+class Lumina2(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.lumina.model.NextDiT)
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            if torch.numel(attention_mask) != attention_mask.sum():
+                out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
+            out['num_tokens'] = comfy.conds.CONDConstant(max(1, torch.sum(attention_mask).item()))
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+        return out
+
+class WAN21(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel)
+        self.image_to_video = image_to_video
+
+    def concat_cond(self, **kwargs):
+        noise = kwargs.get("noise", None)
+        extra_channels = self.diffusion_model.patch_embedding.weight.shape[1] - noise.shape[1]
+        if extra_channels == 0:
+            return None
+
+        image = kwargs.get("concat_latent_image", None)
+        device = kwargs["device"]
+
+        if image is None:
+            shape_image = list(noise.shape)
+            shape_image[1] = extra_channels
+            image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device)
+        else:
+            image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
+            for i in range(0, image.shape[1], 16):
+                image[:, i: i + 16] = self.process_latent_in(image[:, i: i + 16])
+            image = utils.resize_to_batch_size(image, noise.shape[0])
+
+        if not self.image_to_video or extra_channels == image.shape[1]:
+            return image
+
+        if image.shape[1] > (extra_channels - 4):
+            image = image[:, :(extra_channels - 4)]
+
+        mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
+        if mask is None:
+            mask = torch.zeros_like(noise)[:, :4]
+        else:
+            if mask.shape[1] != 4:
+                mask = torch.mean(mask, dim=1, keepdim=True)
+            mask = 1.0 - mask
+            mask = utils.common_upscale(mask.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
+            if mask.shape[-3] < noise.shape[-3]:
+                mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, noise.shape[-3] - mask.shape[-3]), mode='constant', value=0)
+            if mask.shape[1] == 1:
+                mask = mask.repeat(1, 4, 1, 1, 1)
+            mask = utils.resize_to_batch_size(mask, noise.shape[0])
+
+        return torch.cat((mask, image), dim=1)
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+
+        clip_vision_output = kwargs.get("clip_vision_output", None)
+        if clip_vision_output is not None:
+            out['clip_fea'] = comfy.conds.CONDRegular(clip_vision_output.penultimate_hidden_states)
+        return out
+
+class Hunyuan3Dv2(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan3d.model.Hunyuan3Dv2)
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+
+        guidance = kwargs.get("guidance", 5.0)
+        if guidance is not None:
+            out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
+        return out
+
+class HiDream(BaseModel):
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hidream.model.HiDreamImageTransformer2DModel)
+
+    def encode_adm(self, **kwargs):
+        return kwargs["pooled_output"]
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+        conditioning_llama3 = kwargs.get("conditioning_llama3", None)
+        if conditioning_llama3 is not None:
+            out['encoder_hidden_states_llama3'] = comfy.conds.CONDRegular(conditioning_llama3)
        return out
--- a/Show More
+++ b/Show More