2025-04-20 03:13:30 +00:00
365 changed files with 46119 additions and 444982 deletions
--- a/.ci/update_windows/update.py
+++ b/.ci/update_windows/update.py
@ -28,12 +28,12 @@ def pull(repo, remote_name='origin', branch='master'):

                if repo.index.conflicts is not None:
                    for conflict in repo.index.conflicts:
-                        print('Conflicts found in:', conflict[0].path)  # noqa: T201
+                        print('Conflicts found in:', conflict[0].path)
                    raise AssertionError('Conflicts, ahhhhh!!')

                user = repo.default_signature
                tree = repo.index.write_tree()
-                repo.create_commit('HEAD',
+                commit = repo.create_commit('HEAD',
                                            user,
                                            user,
                                            'Merge!',
@ -49,18 +49,18 @@ repo_path = str(sys.argv[1])
 repo = pygit2.Repository(repo_path)
 ident = pygit2.Signature('comfyui', 'comfy@ui')
 try:
-    print("stashing current changes")  # noqa: T201
+    print("stashing current changes")
    repo.stash(ident)
 except KeyError:
-    print("nothing to stash")  # noqa: T201
+    print("nothing to stash")
 backup_branch_name = 'backup_branch_{}'.format(datetime.today().strftime('%Y-%m-%d_%H_%M_%S'))
-print("creating backup branch: {}".format(backup_branch_name))  # noqa: T201
+print("creating backup branch: {}".format(backup_branch_name))
 try:
    repo.branches.local.create(backup_branch_name, repo.head.peel())
 except:
    pass

-print("checking out master branch")  # noqa: T201
+print("checking out master branch")
 branch = repo.lookup_branch('master')
 if branch is None:
    ref = repo.lookup_reference('refs/remotes/origin/master')
@ -72,29 +72,10 @@ else:
    ref = repo.lookup_reference(branch.name)
    repo.checkout(ref)

-print("pulling latest changes")  # noqa: T201
+print("pulling latest changes")
 pull(repo)

-if "--stable" in sys.argv:
-    def latest_tag(repo):
-        versions = []
-        for k in repo.references:
-            try:
-                prefix = "refs/tags/v"
-                if k.startswith(prefix):
-                    version = list(map(int, k[len(prefix):].split(".")))
-                    versions.append((version[0] * 10000000000 + version[1] * 100000 + version[2], k))
-            except:
-                pass
-        versions.sort()
-        if len(versions) > 0:
-            return versions[-1][1]
-        return None
-    latest_tag = latest_tag(repo)
-    if latest_tag is not None:
-        repo.checkout(latest_tag)
-
-print("Done!")  # noqa: T201
+print("Done!")

 self_update = True
 if len(sys.argv) > 2:
@ -134,13 +115,3 @@ if not os.path.exists(req_path) or not files_equal(repo_req_path, req_path):
        shutil.copy(repo_req_path, req_path)
    except:
        pass
-
-
-stable_update_script = os.path.join(repo_path, ".ci/update_windows/update_comfyui_stable.bat")
-stable_update_script_to = os.path.join(cur_path, "update_comfyui_stable.bat")
-
-try:
-    if not file_size(stable_update_script_to) > 10:
-        shutil.copy(stable_update_script, stable_update_script_to)
-except:
-    pass
--- a/.ci/update_windows/update_comfyui_stable.bat
+++ b/.ci/update_windows/update_comfyui_stable.bat
@ -1,8 +0,0 @@
-@echo off
-..\python_embeded\python.exe .\update.py ..\ComfyUI\ --stable
-if exist update_new.py (
-  move /y update_new.py update.py
-  echo Running updater again since it got updated.
-  ..\python_embeded\python.exe .\update.py ..\ComfyUI\ --skip_self_update --stable
-)
-if "%~1"=="" pause
--- a/.ci/windows_base_files/README_VERY_IMPORTANT.txt
+++ b/.ci/windows_base_files/README_VERY_IMPORTANT.txt
@ -14,7 +14,7 @@ run_cpu.bat

 IF YOU GET A RED ERROR IN THE UI MAKE SURE YOU HAVE A MODEL/CHECKPOINT IN: ComfyUI\models\checkpoints

-You can download the stable diffusion 1.5 one from: https://huggingface.co/Comfy-Org/stable-diffusion-v1-5-archive/blob/main/v1-5-pruned-emaonly-fp16.safetensors
+You can download the stable diffusion 1.5 one from: https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt


 RECOMMENDED WAY TO UPDATE:
--- a/.ci/windows_nightly_base_files/run_nvidia_gpu_fast.bat
+++ b/.ci/windows_nightly_base_files/run_nvidia_gpu_fast.bat
@ -1,2 +0,0 @@
-.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast
-pause
--- a/.ci/windows_nightly_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat
+++ b/.ci/windows_nightly_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat
@ -1,2 +0,0 @@
-.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast fp16_accumulation
-pause
--- a/.gitattributes
+++ b/.gitattributes
@ -1,2 +0,0 @@
-/web/assets/** linguist-generated
-/web/** linguist-vendored
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@ -1,8 +1,5 @@
 blank_issues_enabled: true
 contact_links:
-  - name: ComfyUI Frontend Issues
-    url: https://github.com/Comfy-Org/ComfyUI_frontend/issues
-    about: Issues related to the ComfyUI frontend (display issues, user interaction bugs), please go to the frontend repo to file the issue
  - name: ComfyUI Matrix Space
    url: https://app.element.io/#/room/%23comfyui_space%3Amatrix.org
    about: The ComfyUI Matrix Space is available for support and general discussion related to ComfyUI (Matrix is like Discord but open source).
--- a/.github/workflows/pullrequest-ci-run.yml
+++ b/.github/workflows/pullrequest-ci-run.yml
@ -23,7 +23,7 @@ jobs:
            runner_label: [self-hosted, Linux]
            flags: ""
          - os: windows
-            runner_label: [self-hosted, Windows]
+            runner_label: [self-hosted, win]
            flags: ""
    runs-on: ${{ matrix.runner_label }}
    steps:
@ -35,19 +35,3 @@ jobs:
          torch_version: ${{ matrix.torch_version }}
          google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
          comfyui_flags: ${{ matrix.flags }}
-          use_prior_commit: 'true'
-  comment:
-    if: ${{ github.event.label.name == 'Run-CI-Test' }}
-    runs-on: ubuntu-latest
-    permissions:
-      pull-requests: write
-    steps:
-      - uses: actions/github-script@v6
-        with:
-          script: |
-            github.rest.issues.createComment({
-              issue_number: context.issue.number,
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              body: '(Automated Bot Message) CI Tests are running, you can view the results at https://ci.comfy.org/?branch=${{ github.event.pull_request.number }}%2Fmerge'
-            })
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@ -3,8 +3,8 @@ name: Python Linting
 on: [push, pull_request]

 jobs:
-  ruff:
-    name: Run Ruff
+  pylint:
+    name: Run Pylint
    runs-on: ubuntu-latest

    steps:
@ -16,8 +16,8 @@ jobs:
      with:
        python-version: 3.x

-    - name: Install Ruff
-      run: pip install ruff
+    - name: Install Pylint
+      run: pip install pylint

-    - name: Run Ruff
-      run: ruff check .
+    - name: Run Pylint
+      run: pylint --rcfile=.pylintrc $(find . -type f -name "*.py")
--- a/.github/workflows/stable-release.yml
+++ b/.github/workflows/stable-release.yml
@ -12,12 +12,12 @@ on:
        description: 'CUDA version'
        required: true
        type: string
-        default: "126"
+        default: "121"
      python_minor:
        description: 'Python minor version'
        required: true
        type: string
-        default: "12"
+        default: "11"
      python_patch:
        description: 'Python patch version'
        required: true
--- a/.github/workflows/stale-issues.yml
+++ b/.github/workflows/stale-issues.yml
@ -1,21 +0,0 @@
-name: 'Close stale issues'
-on:
-  schedule:
-    # Run daily at 430 am PT
-    - cron: '30 11 * * *'
-permissions:
-  issues: write
-
-jobs:
-  stale:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/stale@v9
-        with:
-          stale-issue-message: "This issue is being marked stale because it has not had any activity for 30 days. Reply below within 7 days if your issue still isn't solved, and it will be left open. Otherwise, the issue will be closed automatically."
-          days-before-stale: 30
-          days-before-close: 7
-          stale-issue-label: 'Stale'
-          only-labels: 'User Support'
-          exempt-all-assignees: true
-          exempt-all-milestones: true
--- a/.github/workflows/test-browser.yml
+++ b/.github/workflows/test-browser.yml
@ -0,0 +1,76 @@
+# This is a temporary action for the duration of the frontend TS migration.
+# This file should be removed after the TS migration is completed.
+# The browser test is here to ensure the TS repo works the same way as the
+# current JS code.
+# If you are adding a UI feature, please sync your changes to the TS repo
+# (huchenlei/ComfyUI_frontend) and update the test expectation files accordingly.
+name: Playwright Browser Tests CI
+
+on:
+  push:
+    branches: [ main, master ]
+  pull_request:
+    branches: [ main, master ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout ComfyUI
+      uses: actions/checkout@v4
+      with:
+        repository: "comfyanonymous/ComfyUI"
+        path: "ComfyUI"
+    - name: Checkout ComfyUI_frontend
+      uses: actions/checkout@v4
+      with:
+        repository: "huchenlei/ComfyUI_frontend"
+        path: "ComfyUI_frontend"
+        ref: "fcc54d803e5b6a9b08a462a1d94899318c96dcbb"
+    - uses: actions/setup-node@v3
+      with:
+        node-version: lts/*
+    - uses: actions/setup-python@v4
+      with:
+        python-version: '3.8'
+    - name: Install requirements
+      run: |
+        python -m pip install --upgrade pip
+        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+        pip install -r requirements.txt
+        pip install wait-for-it
+      working-directory: ComfyUI
+    - name: Start ComfyUI server
+      run: |
+        python main.py --cpu 2>&1 | tee console_output.log &
+        wait-for-it --service 127.0.0.1:8188 -t 600
+      working-directory: ComfyUI
+    - name: Install ComfyUI_frontend dependencies
+      run: |
+        npm ci
+      working-directory: ComfyUI_frontend
+    - name: Install Playwright Browsers
+      run: npx playwright install --with-deps
+      working-directory: ComfyUI_frontend
+    - name: Run Playwright tests
+      run: npx playwright test
+      working-directory: ComfyUI_frontend
+    - name: Check for unhandled exceptions in server log
+      run: |
+        if grep -qE "Exception|Error" console_output.log; then
+          echo "Unhandled exception/error found in server log."
+          exit 1
+        fi
+      working-directory: ComfyUI
+    - uses: actions/upload-artifact@v4
+      if: always()
+      with:
+        name: playwright-report
+        path: ComfyUI_frontend/playwright-report/
+        retention-days: 30
+    - uses: actions/upload-artifact@v4
+      if: always()
+      with:
+        name: console-output
+        path: ComfyUI/console_output.log
+        retention-days: 30
--- a/.github/workflows/test-build.yml
+++ b/.github/workflows/test-build.yml
@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
--- a/.github/workflows/test-ci.yml
+++ b/.github/workflows/test-ci.yml
@ -20,8 +20,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        # os: [macos, linux, windows]
-        os: [macos, linux]
+        os: [macos, linux, windows]
        python_version: ["3.9", "3.10", "3.11", "3.12"]
        cuda_version: ["12.1"]
        torch_version: ["stable"]
@ -32,9 +31,9 @@ jobs:
          - os: linux
            runner_label: [self-hosted, Linux]
            flags: ""
-          # - os: windows
-          #   runner_label: [self-hosted, Windows]
-          #   flags: ""
+          - os: windows
+            runner_label: [self-hosted, win]
+            flags: ""
    runs-on: ${{ matrix.runner_label }}
    steps:
      - name: Test Workflows
@ -46,28 +45,28 @@ jobs:
          google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
          comfyui_flags: ${{ matrix.flags }}

-  # test-win-nightly:
-  #   strategy:
-  #     fail-fast: true
-  #     matrix:
-  #       os: [windows]
-  #       python_version: ["3.9", "3.10", "3.11", "3.12"]
-  #       cuda_version: ["12.1"]
-  #       torch_version: ["nightly"]
-  #       include:
-  #         - os: windows
-  #           runner_label: [self-hosted, Windows]
-  #           flags: ""
-  #   runs-on: ${{ matrix.runner_label }}
-  #   steps:
-  #     - name: Test Workflows
-  #       uses: comfy-org/comfy-action@main
-  #       with:
-  #         os: ${{ matrix.os }}
-  #         python_version: ${{ matrix.python_version }}
-  #         torch_version: ${{ matrix.torch_version }}
-  #         google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
-  #         comfyui_flags: ${{ matrix.flags }}
+  test-win-nightly:
+    strategy:
+      fail-fast: true
+      matrix:
+        os: [windows]
+        python_version: ["3.9", "3.10", "3.11", "3.12"]
+        cuda_version: ["12.1"]
+        torch_version: ["nightly"]
+        include:
+          - os: windows
+            runner_label: [self-hosted, win]
+            flags: ""
+    runs-on: ${{ matrix.runner_label }}
+    steps:
+      - name: Test Workflows
+        uses: comfy-org/comfy-action@main
+        with:
+          os: ${{ matrix.os }}
+          python_version: ${{ matrix.python_version }}
+          torch_version: ${{ matrix.torch_version }}
+          google_credentials: ${{ secrets.GCS_SERVICE_ACCOUNT_JSON }}
+          comfyui_flags: ${{ matrix.flags }}

  test-unix-nightly:
    strategy:
--- a/.github/workflows/test-launch.yml
+++ b/.github/workflows/test-launch.yml
@ -1,45 +0,0 @@
-name: Test server launches without errors
-
-on:
-  push:
-    branches: [ main, master ]
-  pull_request:
-    branches: [ main, master ]
-
-jobs:
-  test:
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout ComfyUI
-      uses: actions/checkout@v4
-      with:
-        repository: "comfyanonymous/ComfyUI"
-        path: "ComfyUI"
-    - uses: actions/setup-python@v4
-      with:
-        python-version: '3.9'
-    - name: Install requirements
-      run: |
-        python -m pip install --upgrade pip
-        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-        pip install -r requirements.txt
-        pip install wait-for-it
-      working-directory: ComfyUI
-    - name: Start ComfyUI server
-      run: |
-        python main.py --cpu 2>&1 | tee console_output.log &
-        wait-for-it --service 127.0.0.1:8188 -t 30
-      working-directory: ComfyUI
-    - name: Check for unhandled exceptions in server log
-      run: |
-        if grep -qE "Exception|Error" console_output.log; then
-          echo "Unhandled exception/error found in server log."
-          exit 1
-        fi
-      working-directory: ComfyUI
-    - uses: actions/upload-artifact@v4
-      if: always()
-      with:
-        name: console-output
-        path: ComfyUI/console_output.log
-        retention-days: 30
--- a/.github/workflows/test-unit.yml
+++ b/.github/workflows/test-unit.yml
@ -1,29 +1,29 @@
-name: Unit Tests
+name: Tests CI

-on:
-  push:
-    branches: [ main, master ]
-  pull_request:
-    branches: [ main, master ]
+on: [push, pull_request]

 jobs:
  test:
-    strategy:
-      matrix:
-        os: [ubuntu-latest, windows-latest, macos-latest]
-    runs-on: ${{ matrix.os }}
-    continue-on-error: true
+    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
-    - name: Set up Python      
-      uses: actions/setup-python@v4
+    - uses: actions/setup-node@v3
      with:
-        python-version: '3.12'
+        node-version: 18
+    - uses: actions/setup-python@v4
+      with: 
+        python-version: '3.10'
    - name: Install requirements
      run: |
        python -m pip install --upgrade pip
        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
        pip install -r requirements.txt
+    - name: Run Tests
+      run: | 
+        npm ci
+        npm run test:generate
+        npm test -- --verbose
+      working-directory: ./tests-ui
    - name: Run Unit Tests
      run: |
        pip install -r tests-unit/requirements.txt
--- a/.github/workflows/update-version.yml
+++ b/.github/workflows/update-version.yml
@ -1,58 +0,0 @@
-name: Update Version File
-
-on:
-  pull_request:
-    paths:
-      - "pyproject.toml"
-    branches:
-      - master
-
-jobs:
-  update-version:
-    runs-on: ubuntu-latest
-    # Don't run on fork PRs
-    if: github.event.pull_request.head.repo.full_name == github.repository
-    permissions:
-      pull-requests: write
-      contents: write
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.11"
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-
-      - name: Update comfyui_version.py
-        run: |
-          # Read version from pyproject.toml and update comfyui_version.py
-          python -c '
-          import tomllib
-
-          # Read version from pyproject.toml
-          with open("pyproject.toml", "rb") as f:
-              config = tomllib.load(f)
-              version = config["project"]["version"]
-
-          # Write version to comfyui_version.py
-          with open("comfyui_version.py", "w") as f:
-              f.write("# This file is automatically generated by the build process when version is\n")
-              f.write("# updated in pyproject.toml.\n")
-              f.write(f"__version__ = \"{version}\"\n")
-          '
-
-      - name: Commit changes
-        run: |
-          git config --local user.name "github-actions"
-          git config --local user.email "github-actions@github.com"
-          git fetch origin ${{ github.head_ref }}
-          git checkout -B ${{ github.head_ref }} origin/${{ github.head_ref }}
-          git add comfyui_version.py
-          git diff --quiet && git diff --staged --quiet || git commit -m "chore: Update comfyui_version.py to match pyproject.toml"
-          git push origin HEAD:${{ github.head_ref }}
--- a/.github/workflows/windows_release_dependencies.yml
+++ b/.github/workflows/windows_release_dependencies.yml
@ -12,18 +12,18 @@ on:
        description: 'extra dependencies'
        required: false
        type: string
-        default: ""
+        default: "\"numpy<2\""
      cu:
        description: 'cuda version'
        required: true
        type: string
-        default: "126"
+        default: "124"

      python_minor:
        description: 'python minor version'
        required: true
        type: string
-        default: "12"
+        default: "11"

      python_patch:
        description: 'python patch version'
--- a/.github/workflows/windows_release_nightly_pytorch.yml
+++ b/.github/workflows/windows_release_nightly_pytorch.yml
@ -7,19 +7,19 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "128"
+        default: "124"

      python_minor:
        description: 'python minor version'
        required: true
        type: string
-        default: "13"
+        default: "12"

      python_patch:
        description: 'python patch version'
        required: true
        type: string
-        default: "2"
+        default: "4"
 #  push:
 #    branches:
 #      - master
@ -34,7 +34,7 @@ jobs:
    steps:
        - uses: actions/checkout@v4
          with:
-            fetch-depth: 30
+            fetch-depth: 0
            persist-credentials: false
        - uses: actions/setup-python@v5
          with:
@ -67,14 +67,13 @@ jobs:
            mkdir update
            cp -r ComfyUI/.ci/update_windows/* ./update/
            cp -r ComfyUI/.ci/windows_base_files/* ./
-            cp -r ComfyUI/.ci/windows_nightly_base_files/* ./

            echo "call update_comfyui.bat nopause
            ..\python_embeded\python.exe -s -m pip install --upgrade --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2
            pause" > ./update/update_comfyui_and_python_dependencies.bat
            cd ..

-            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=512m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch
+            "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=8 -mfb=64 -md=32m -ms=on -mf=BCJ2 ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI_windows_portable_nightly_pytorch
            mv ComfyUI_windows_portable_nightly_pytorch.7z ComfyUI/ComfyUI_windows_portable_nvidia_or_cpu_nightly_pytorch.7z

            cd ComfyUI_windows_portable_nightly_pytorch
--- a/.github/workflows/windows_release_package.yml
+++ b/.github/workflows/windows_release_package.yml
@ -7,13 +7,13 @@ on:
        description: 'cuda version'
        required: true
        type: string
-        default: "126"
+        default: "124"

      python_minor:
        description: 'python minor version'
        required: true
        type: string
-        default: "12"
+        default: "11"

      python_patch:
        description: 'python patch version'
--- a/.gitignore
+++ b/.gitignore
@ -12,7 +12,6 @@ extra_model_paths.yaml
 .vscode/
 .idea/
 venv/
-.venv/
 /web/extensions/*
 !/web/extensions/logging.js.example
 !/web/extensions/core/
@ -20,4 +19,3 @@ venv/
 /user/
 *.log
 web_custom_versions/
-.DS_Store
--- a/.pylintrc
+++ b/.pylintrc
@ -0,0 +1,3 @@
+[MESSAGES CONTROL]
+disable=all
+enable=eval-used
--- a/23
+++ b/23
@ -1,24 +1 @@
-# Admins
 *       @comfyanonymous
-
-# Note: Github teams syntax cannot be used here as the repo is not owned by Comfy-Org.
-# Inlined the team members for now.
-
-# Maintainers
-*.md @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/tests/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/tests-unit/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/notebooks/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/script_examples/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/.github/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/requirements.txt @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-/pyproject.toml @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
-
-# Python web server
-/api_server/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @christian-byrne
-/app/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @christian-byrne
-/utils/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @christian-byrne
-
-# Node developers
-/comfy_extras/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne
-/comfy/comfy_types/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne
--- a/README.md
+++ b/README.md
@ -1,78 +1,18 @@
-<div align="center">
+ComfyUI
+=======
+The most powerful and modular stable diffusion GUI and backend.
+-----------
+![ComfyUI Screenshot](comfyui_screenshot.png)

-# ComfyUI
-**The most powerful and modular visual AI engine and application.**
-
-
-[![Website][website-shield]][website-url]
-[![Dynamic JSON Badge][discord-shield]][discord-url]
-[![Matrix][matrix-shield]][matrix-url]
-<br>
-[![][github-release-shield]][github-release-link]
-[![][github-release-date-shield]][github-release-link]
-[![][github-downloads-shield]][github-downloads-link]
-[![][github-downloads-latest-shield]][github-downloads-link]
-
-[matrix-shield]: https://img.shields.io/badge/Matrix-000000?style=flat&logo=matrix&logoColor=white
-[matrix-url]: https://app.element.io/#/room/%23comfyui_space%3Amatrix.org
-[website-shield]: https://img.shields.io/badge/ComfyOrg-4285F4?style=flat
-[website-url]: https://www.comfy.org/
-<!-- Workaround to display total user from https://github.com/badges/shields/issues/4500#issuecomment-2060079995 -->
-[discord-shield]: https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fdiscord.com%2Fapi%2Finvites%2Fcomfyorg%3Fwith_counts%3Dtrue&query=%24.approximate_member_count&logo=discord&logoColor=white&label=Discord&color=green&suffix=%20total
-[discord-url]: https://www.comfy.org/discord
-
-[github-release-shield]: https://img.shields.io/github/v/release/comfyanonymous/ComfyUI?style=flat&sort=semver
-[github-release-link]: https://github.com/comfyanonymous/ComfyUI/releases
-[github-release-date-shield]: https://img.shields.io/github/release-date/comfyanonymous/ComfyUI?style=flat
-[github-downloads-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/total?style=flat
-[github-downloads-latest-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/latest/total?style=flat&label=downloads%40latest
-[github-downloads-link]: https://github.com/comfyanonymous/ComfyUI/releases
-
-![ComfyUI Screenshot](https://github.com/user-attachments/assets/7ccaf2c1-9b72-41ae-9a89-5688c94b7abe)
-</div>
-
-ComfyUI lets you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. Available on Windows, Linux, and macOS.
-
-## Get Started
-
-#### [Desktop Application](https://www.comfy.org/download)
- The easiest way to get started. 
- Available on Windows & macOS.
-
-#### [Windows Portable Package](#installing)
- Get the latest commits and completely portable.
- Available on Windows.
-
-#### [Manual Install](#manual-install-windows-linux)
-Supports all operating systems and GPU types (NVIDIA, AMD, Intel, Apple Silicon, Ascend).
-
-## [Examples](https://comfyanonymous.github.io/ComfyUI_examples/)
-See what ComfyUI can do with the [example workflows](https://comfyanonymous.github.io/ComfyUI_examples/).
+This UI lets you design and execute advanced Stable Diffusion pipelines using a graph/nodes/flowchart-based interface. To see workflow examples and what ComfyUI can do, check out:
+### [ComfyUI Examples](https://comfyanonymous.github.io/ComfyUI_examples/)

+### [Installing ComfyUI](#installing)

 ## Features
 - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
- Image Models
-   - SD1.x, SD2.x,
-   - [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
-   - [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/)
-   - [SD3 and SD3.5](https://comfyanonymous.github.io/ComfyUI_examples/sd3/)
-   - Pixart Alpha and Sigma
-   - [AuraFlow](https://comfyanonymous.github.io/ComfyUI_examples/aura_flow/)
-   - [HunyuanDiT](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_dit/)
+- Fully supports SD1.x, SD2.x, [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/), [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/), [SD3](https://comfyanonymous.github.io/ComfyUI_examples/sd3/) and [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
 - [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
-   - [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/)
-   - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/)
- Video Models
-   - [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/)
-   - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/)
-   - [LTX-Video](https://comfyanonymous.github.io/ComfyUI_examples/ltxv/)
-   - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/)
-   - [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/)
-   - [Wan 2.1](https://comfyanonymous.github.io/ComfyUI_examples/wan/)
- 3D Models
-   - [Hunyuan3D 2.0](https://docs.comfy.org/tutorials/3d/hunyuan3D-2)
- [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/)
 - Asynchronous Queue system
 - Many optimizations: Only re-executes the parts of the workflow that changes between executions.
 - Smart memory management: can automatically run models on GPUs with as low as 1GB vram.
@ -92,6 +32,9 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
 - [GLIGEN](https://comfyanonymous.github.io/ComfyUI_examples/gligen/)
 - [Model Merging](https://comfyanonymous.github.io/ComfyUI_examples/model_merging/)
 - [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/)
+- [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
+- [AuraFlow](https://comfyanonymous.github.io/ComfyUI_examples/aura_flow/)
+- [HunyuanDiT](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_dit/)
 - Latent previews with [TAESD](#how-to-show-high-quality-previews)
 - Starts up very fast.
 - Works fully offline: will never download anything.
@ -103,43 +46,36 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git

 | Keybind                            | Explanation                                                                                                        |
 |------------------------------------|--------------------------------------------------------------------------------------------------------------------|
-| `Ctrl` + `Enter`                      | Queue up current graph for generation                                                                              |
-| `Ctrl` + `Shift` + `Enter`              | Queue up current graph as first for generation                                                                     |
-| `Ctrl` + `Alt` + `Enter`                | Cancel current generation                                                                                          |
-| `Ctrl` + `Z`/`Ctrl` + `Y`                 | Undo/Redo                                                                                                          |
-| `Ctrl` + `S`                          | Save workflow                                                                                                      |
-| `Ctrl` + `O`                          | Load workflow                                                                                                      |
-| `Ctrl` + `A`                          | Select all nodes                                                                                                   |
-| `Alt` + `C`                           | Collapse/uncollapse selected nodes                                                                                 |
-| `Ctrl` + `M`                          | Mute/unmute selected nodes                                                                                         |
-| `Ctrl` + `B`                           | Bypass selected nodes (acts like the node was removed from the graph and the wires reconnected through)            |
-| `Delete`/`Backspace`                   | Delete selected nodes                                                                                              |
-| `Ctrl` + `Backspace`                   | Delete the current graph                                                                                           |
-| `Space`                              | Move the canvas around when held and moving the cursor                                                             |
-| `Ctrl`/`Shift` + `Click`                 | Add clicked node to selection                                                                                      |
-| `Ctrl` + `C`/`Ctrl` + `V`                  | Copy and paste selected nodes (without maintaining connections to outputs of unselected nodes)                     |
-| `Ctrl` + `C`/`Ctrl` + `Shift` + `V`          | Copy and paste selected nodes (maintaining connections from outputs of unselected nodes to inputs of pasted nodes) |
-| `Shift` + `Drag`                       | Move multiple selected nodes at the same time                                                                      |
-| `Ctrl` + `D`                           | Load default graph                                                                                                 |
-| `Alt` + `+`                          | Canvas Zoom in                                                                                                     |
-| `Alt` + `-`                          | Canvas Zoom out                                                                                                    |
-| `Ctrl` + `Shift` + LMB + Vertical drag | Canvas Zoom in/out                                                                                                 |
-| `P`                                  | Pin/Unpin selected nodes                                                                                           |
-| `Ctrl` + `G`                           | Group selected nodes                                                                                               |
-| `Q`                                 | Toggle visibility of the queue                                                                                     |
-| `H`                                  | Toggle visibility of history                                                                                       |
-| `R`                                  | Refresh graph                                                                                                      |
-| `F`                                  | Show/Hide menu                                                                                                      |
-| `.`                                  | Fit view to selection (Whole graph when nothing is selected)                                                        |
+| Ctrl + Enter                       | Queue up current graph for generation                                                                              |
+| Ctrl + Shift + Enter               | Queue up current graph as first for generation                                                                     |
+| Ctrl + Z/Ctrl + Y                  | Undo/Redo                                                                                                          |
+| Ctrl + S                           | Save workflow                                                                                                      |
+| Ctrl + O                           | Load workflow                                                                                                      |
+| Ctrl + A                           | Select all nodes                                                                                                   |
+| Alt + C                            | Collapse/uncollapse selected nodes                                                                                 |
+| Ctrl + M                           | Mute/unmute selected nodes                                                                                         |
+| Ctrl + B                           | Bypass selected nodes (acts like the node was removed from the graph and the wires reconnected through)            |
+| Delete/Backspace                   | Delete selected nodes                                                                                              |
+| Ctrl + Backspace                   | Delete the current graph                                                                                           |
+| Space                              | Move the canvas around when held and moving the cursor                                                             |
+| Ctrl/Shift + Click                 | Add clicked node to selection                                                                                      |
+| Ctrl + C/Ctrl + V                  | Copy and paste selected nodes (without maintaining connections to outputs of unselected nodes)                     |
+| Ctrl + C/Ctrl + Shift + V          | Copy and paste selected nodes (maintaining connections from outputs of unselected nodes to inputs of pasted nodes) |
+| Shift + Drag                       | Move multiple selected nodes at the same time                                                                      |
+| Ctrl + D                           | Load default graph                                                                                                 |
+| Alt + `+`                          | Canvas Zoom in                                                                                                     |
+| Alt + `-`                          | Canvas Zoom out                                                                                                    |
+| Ctrl + Shift + LMB + Vertical drag | Canvas Zoom in/out                                                                                                 |
+| Q                                  | Toggle visibility of the queue                                                                                     |
+| H                                  | Toggle visibility of history                                                                                       |
+| R                                  | Refresh graph                                                                                                      |
 | Double-Click LMB                   | Open node quick search palette                                                                                     |
-| `Shift` + Drag                       | Move multiple wires at once                                                                                        |
-| `Ctrl` + `Alt` + LMB                   | Disconnect all wires from clicked slot                                                                             |

-`Ctrl` can also be replaced with `Cmd` instead for macOS users
+Ctrl can also be replaced with Cmd instead for macOS users

 # Installing

-## Windows Portable
+## Windows

 There is a portable standalone build for Windows that should work for running on Nvidia GPUs or for running on your CPU only on the [releases page](https://github.com/comfyanonymous/ComfyUI/releases).

@ -149,8 +85,6 @@ Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you

 If you have trouble extracting it, right click the file -> properties -> unblock

-If you have a 50 series Blackwell card like a 5090 or 5080 see [this discussion thread](https://github.com/comfyanonymous/ComfyUI/discussions/6643)
-
 #### How do I share models between another UI and ComfyUI?

 See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor.
@ -159,19 +93,8 @@ See the [Config file](extra_model_paths.yaml.example) to set the search paths fo

 To run it on services like paperspace, kaggle or colab you can use my [Jupyter Notebook](notebooks/comfyui_colab.ipynb)

-
-## [comfy-cli](https://docs.comfy.org/comfy-cli/getting-started)
-
-You can install and start ComfyUI using comfy-cli:
-```bash
-pip install comfy-cli
-comfy install
-```
-
 ## Manual Install (Windows, Linux)

-Python 3.13 is supported, but using 3.12 is recommended because some custom nodes and their dependencies might not support 3.13 yet.
-
 Git clone this repo.

 Put your SD checkpoints (the huge ckpt/safetensors files) in: models/checkpoints
@ -182,45 +105,21 @@ Put your VAE in: models/vae
 ### AMD GPUs (Linux only)
 AMD users can install ROCm and PyTorch with pip if they are not already installed. This is the command to install the stable version:

-```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2.4```
+```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0```

-This is the command to install the nightly with ROCm 6.3 which might have some performance improvements:
+This is the command to install the nightly with ROCm 6.1 which might have some performance improvements:

-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3```
-
-### Intel GPUs (Windows and Linux)
-
-(Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip (currently available in PyTorch nightly builds). More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html)
-  
-1. To install PyTorch nightly, use the following command:
-
-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu```
-
-2. Launch ComfyUI by running `python main.py`
-
-
-(Option 2) Alternatively, Intel GPUs supported by Intel Extension for PyTorch (IPEX) can leverage IPEX for improved performance.
-
-1. For Intel® Arc™ A-Series Graphics utilizing IPEX, create a conda environment and use the commands below:
-
-```
-conda install libuv
-pip install torch==2.3.1.post0+cxx11.abi torchvision==0.18.1.post0+cxx11.abi torchaudio==2.3.1.post0+cxx11.abi intel-extension-for-pytorch==2.3.110.post0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
-```
-
-For other supported Intel GPUs with IPEX, visit [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) for more information.
-
-Additional discussion and help can be found [here](https://github.com/comfyanonymous/ComfyUI/discussions/476).
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.1```

 ### NVIDIA

 Nvidia users should install stable pytorch using this command:

-```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu126```
+```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121```

-This is the command to install pytorch nightly instead which supports the new blackwell 50xx series GPUs and might have performance improvements.
+This is the command to install pytorch nightly instead which might have performance improvements:

-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124```

 #### Troubleshooting

@ -240,6 +139,17 @@ After this you should have everything installed and can proceed to running Comfy

 ### Others:

+#### Intel GPUs
+
+Intel GPU support is available for all Intel GPUs supported by Intel's Extension for Pytorch (IPEX) with the support requirements listed in the [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) page. Choose your platform and method of install and follow the instructions. The steps are as follows:
+
+1. If needed, start by installing the drivers or kernel version listed (or newer) on the IPEX Installation page linked above for Windows or Linux.
+1. Follow the instructions to install [Intel's oneAPI Basekit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html) for your platform.
+1. Install the packages for IPEX using the instructions provided in the Installation page for your platform.
+1. Follow the [ComfyUI manual installation](#manual-install-windows-linux) instructions for Windows and Linux and run ComfyUI normally as described above after everything is installed.
+
+Additional discussion and help can be found [here](https://github.com/comfyanonymous/ComfyUI/discussions/476).
+
 #### Apple Mac silicon

 You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS version.
@ -255,23 +165,6 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve

 ```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```

-#### Ascend NPUs
-
-For models compatible with Ascend Extension for PyTorch (torch_npu). To get started, ensure your environment meets the prerequisites outlined on the [installation](https://ascend.github.io/docs/sources/ascend/quick_install.html) page. Here's a step-by-step guide tailored to your platform and installation method:
-
-1. Begin by installing the recommended or newer kernel version for Linux as specified in the Installation page of torch-npu, if necessary.
-2. Proceed with the installation of Ascend Basekit, which includes the driver, firmware, and CANN, following the instructions provided for your specific platform.
-3. Next, install the necessary packages for torch-npu by adhering to the platform-specific instructions on the [Installation](https://ascend.github.io/docs/sources/pytorch/install.html#pytorch) page.
-4. Finally, adhere to the [ComfyUI manual installation](#manual-install-windows-linux) guide for Linux. Once all components are installed, you can run ComfyUI as described earlier.
-
-#### Cambricon MLUs
-
-For models compatible with Cambricon Extension for PyTorch (torch_mlu). Here's a step-by-step guide tailored to your platform and installation method:
-
-1. Install the Cambricon CNToolkit by adhering to the platform-specific instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cntoolkit_3.7.2/cntoolkit_install_3.7.2/index.html) page.
-2. Next, install PyTorch (torch_mlu) by following the instructions on the [Installation](https://www.cambricon.com/docs/sdk_1.15.0/cambricon_pytorch_1.17.0/user_guide_1.9/index.html) page.
-3. Launch ComfyUI by running `python main.py`
-
 # Running

 ```python main.py```
@ -284,14 +177,6 @@ For 6700, 6600 and maybe other RDNA2 or older: ```HSA_OVERRIDE_GFX_VERSION=10.3.

 For AMD 7600 and maybe other RDNA3 cards: ```HSA_OVERRIDE_GFX_VERSION=11.0.0 python main.py```

-### AMD ROCm Tips
-
-You can enable experimental memory efficient attention on pytorch 2.5 in ComfyUI on RDNA3 and potentially other AMD GPUs using this command:
-
-```TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 python main.py --use-pytorch-cross-attention```
-
-You can also try setting this env variable `PYTORCH_TUNABLEOP_ENABLED=1` which might speed things up at the cost of a very slow initial run.
-
 # Notes

 Only parts of the graph that have an output with all the correct inputs will be executed.
@ -315,7 +200,7 @@ To use a textual inversion concepts/embeddings in a text prompt put them in the

 Use ```--preview-method auto``` to enable previews.

-The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_decoder.pth, taesdxl_decoder.pth, taesd3_decoder.pth and taef1_decoder.pth](https://github.com/madebyollin/taesd/) and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI and launch it with `--preview-method taesd` to enable high-quality previews.
+The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesd_decoder.pth) (for SD1.x and SD2.x) and [taesdxl_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesdxl_decoder.pth) (for SDXL) models and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI to enable high-quality previews.

 ## How to use TLS/SSL?
 Generate a self-signed certificate (not appropriate for shared/production use) and key by running the command: `openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -sha256 -days 3650 -nodes -subj "/C=XX/ST=StateName/L=CityName/O=CompanyName/OU=CompanySectionName/CN=CommonNameOrHostname"`
@ -327,55 +212,13 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w

 ## Support and dev channel

-[Discord](https://comfy.org/discord): Try the #help or #feedback channels.
-
 [Matrix space: #comfyui_space:matrix.org](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) (it's like discord but open source).

 See also: [https://www.comfy.org/](https://www.comfy.org/)

-## Frontend Development
-
-As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory.
-
-### Reporting Issues and Requesting Features
-
-For any bugs, issues, or feature requests related to the frontend, please use the [ComfyUI Frontend repository](https://github.com/Comfy-Org/ComfyUI_frontend). This will help us manage and address frontend-specific concerns more efficiently.
-
-### Using the Latest Frontend
-
-The new frontend is now the default for ComfyUI. However, please note:
-
-1. The frontend in the main ComfyUI repository is updated fortnightly.
-2. Daily releases are available in the separate frontend repository.
-
-To use the most up-to-date frontend version:
-
-1. For the latest daily release, launch ComfyUI with this command line argument:
-
-   ```
-   --front-end-version Comfy-Org/ComfyUI_frontend@latest
-   ```
-
-2. For a specific version, replace `latest` with the desired version number:
-
-   ```
-   --front-end-version Comfy-Org/ComfyUI_frontend@1.2.2
-   ```
-
-This approach allows you to easily switch between the stable fortnightly release and the cutting-edge daily updates, or even specific versions for testing purposes.
-
-### Accessing the Legacy Frontend
-
-If you need to use the legacy frontend for any reason, you can access it using the following command line argument:
-
-```
--front-end-version Comfy-Org/ComfyUI_legacy_frontend@latest
-```
-
-This will use a snapshot of the legacy frontend preserved in the [ComfyUI Legacy Frontend repository](https://github.com/Comfy-Org/ComfyUI_legacy_frontend).
-
 # QA

 ### Which GPU should I buy for this?

 [See this page for some recommendations](https://github.com/comfyanonymous/ComfyUI/wiki/Which-GPU-should-I-buy-for-ComfyUI)
+
--- a/api_server/init.py
+++ b/api_server/init.py
--- a/api_server/routes/init.py
+++ b/api_server/routes/init.py
--- a/api_server/routes/internal/README.md
+++ b/api_server/routes/internal/README.md
@ -1,3 +0,0 @@
-# ComfyUI Internal Routes
-
-All routes under the `/internal` path are designated for **internal use by ComfyUI only**. These routes are not intended for use by external applications and may change at any time without notice.
--- a/api_server/routes/internal/init.py
+++ b/api_server/routes/internal/init.py
--- a/api_server/routes/internal/internal_routes.py
+++ b/api_server/routes/internal/internal_routes.py
@ -1,73 +0,0 @@
-from aiohttp import web
-from typing import Optional
-from folder_paths import folder_names_and_paths, get_directory_by_type
-from api_server.services.terminal_service import TerminalService
-import app.logger
-import os
-
-class InternalRoutes:
-    '''
-    The top level web router for internal routes: /internal/*
-    The endpoints here should NOT be depended upon. It is for ComfyUI frontend use only.
-    Check README.md for more information.
-    '''
-
-    def __init__(self, prompt_server):
-        self.routes: web.RouteTableDef = web.RouteTableDef()
-        self._app: Optional[web.Application] = None
-        self.prompt_server = prompt_server
-        self.terminal_service = TerminalService(prompt_server)
-
-    def setup_routes(self):
-        @self.routes.get('/logs')
-        async def get_logs(request):
-            return web.json_response("".join([(l["t"] + " - " + l["m"]) for l in app.logger.get_logs()]))
-
-        @self.routes.get('/logs/raw')
-        async def get_raw_logs(request):
-            self.terminal_service.update_size()
-            return web.json_response({
-                "entries": list(app.logger.get_logs()),
-                "size": {"cols": self.terminal_service.cols, "rows": self.terminal_service.rows}
-            })
-
-        @self.routes.patch('/logs/subscribe')
-        async def subscribe_logs(request):
-            json_data = await request.json()
-            client_id = json_data["clientId"]
-            enabled = json_data["enabled"]
-            if enabled:
-                self.terminal_service.subscribe(client_id)
-            else:
-                self.terminal_service.unsubscribe(client_id)
-
-            return web.Response(status=200)
-
-
-        @self.routes.get('/folder_paths')
-        async def get_folder_paths(request):
-            response = {}
-            for key in folder_names_and_paths:
-                response[key] = folder_names_and_paths[key][0]
-            return web.json_response(response)
-
-        @self.routes.get('/files/{directory_type}')
-        async def get_files(request: web.Request) -> web.Response:
-            directory_type = request.match_info['directory_type']
-            if directory_type not in ("output", "input", "temp"):
-                return web.json_response({"error": "Invalid directory type"}, status=400)
-
-            directory = get_directory_by_type(directory_type)
-            sorted_files = sorted(
-                (entry for entry in os.scandir(directory) if entry.is_file()),
-                key=lambda entry: -entry.stat().st_mtime
-            )
-            return web.json_response([entry.name for entry in sorted_files], status=200)
-
-
-    def get_app(self):
-        if self._app is None:
-            self._app = web.Application()
-            self.setup_routes()
-            self._app.add_routes(self.routes)
-        return self._app
--- a/api_server/services/init.py
+++ b/api_server/services/init.py
--- a/api_server/services/terminal_service.py
+++ b/api_server/services/terminal_service.py
@ -1,60 +0,0 @@
-from app.logger import on_flush
-import os
-import shutil
-
-
-class TerminalService:
-    def __init__(self, server):
-        self.server = server
-        self.cols = None
-        self.rows = None
-        self.subscriptions = set()
-        on_flush(self.send_messages)
-
-    def get_terminal_size(self):
-        try:
-            size = os.get_terminal_size()
-            return (size.columns, size.lines)
-        except OSError:
-            try:
-                size = shutil.get_terminal_size()
-                return (size.columns, size.lines)
-            except OSError:
-                return (80, 24)  # fallback to 80x24
-
-    def update_size(self):
-        columns, lines = self.get_terminal_size()
-        changed = False
-
-        if columns != self.cols:
-            self.cols = columns
-            changed = True
-
-        if lines != self.rows:
-            self.rows = lines
-            changed = True
-
-        if changed:
-            return {"cols": self.cols, "rows": self.rows}
-
-        return None
-
-    def subscribe(self, client_id):
-        self.subscriptions.add(client_id)
-
-    def unsubscribe(self, client_id):
-        self.subscriptions.discard(client_id)
-
-    def send_messages(self, entries):
-        if not len(entries) or not len(self.subscriptions):
-            return
-
-        new_size = self.update_size()
-
-        for client_id in self.subscriptions.copy(): # prevent: Set changed size during iteration
-            if client_id not in self.server.sockets:
-                # Automatically unsub if the socket has disconnected
-                self.unsubscribe(client_id)
-                continue
-
-            self.server.send_sync("logs", {"entries": entries, "size": new_size}, client_id)
--- a/api_server/utils/file_operations.py
+++ b/api_server/utils/file_operations.py
@ -1,42 +0,0 @@
-import os
-from typing import List, Union, TypedDict, Literal
-from typing_extensions import TypeGuard
-class FileInfo(TypedDict):
-    name: str
-    path: str
-    type: Literal["file"]
-    size: int
-
-class DirectoryInfo(TypedDict):
-    name: str
-    path: str
-    type: Literal["directory"]
-
-FileSystemItem = Union[FileInfo, DirectoryInfo]
-
-def is_file_info(item: FileSystemItem) -> TypeGuard[FileInfo]:
-    return item["type"] == "file"
-
-class FileSystemOperations:
-    @staticmethod
-    def walk_directory(directory: str) -> List[FileSystemItem]:
-        file_list: List[FileSystemItem] = []
-        for root, dirs, files in os.walk(directory):
-            for name in files:
-                file_path = os.path.join(root, name)
-                relative_path = os.path.relpath(file_path, directory)
-                file_list.append({
-                    "name": name,
-                    "path": relative_path,
-                    "type": "file",
-                    "size": os.path.getsize(file_path)
-                })
-            for name in dirs:
-                dir_path = os.path.join(root, name)
-                relative_path = os.path.relpath(dir_path, directory)
-                file_list.append({
-                    "name": name,
-                    "path": relative_path,
-                    "type": "directory"
-                })
-        return file_list
--- a/app/app_settings.py
+++ b/app/app_settings.py
@ -1,7 +1,6 @@
 import os
 import json
 from aiohttp import web
-import logging


 class AppSettings():
@ -9,21 +8,11 @@ class AppSettings():
        self.user_manager = user_manager

    def get_settings(self, request):
-        try:
        file = self.user_manager.get_request_user_filepath(
-                request,
-                "comfy.settings.json"
-            )
-        except KeyError as e:
-            logging.error("User settings not found.")
-            raise web.HTTPUnauthorized() from e
+            request, "comfy.settings.json")
        if os.path.isfile(file):
-            try:
            with open(file) as f:
                return json.load(f)
-            except:
-                logging.error(f"The user settings file is corrupted: {file}")
-                return {}
        else:
            return {}

--- a/app/custom_node_manager.py
+++ b/app/custom_node_manager.py
@ -1,134 +0,0 @@
-from __future__ import annotations
-
-import os
-import folder_paths
-import glob
-from aiohttp import web
-import json
-import logging
-from functools import lru_cache
-
-from utils.json_util import merge_json_recursive
-
-
-# Extra locale files to load into main.json
-EXTRA_LOCALE_FILES = [
-    "nodeDefs.json",
-    "commands.json",
-    "settings.json",
-]
-
-
-def safe_load_json_file(file_path: str) -> dict:
-    if not os.path.exists(file_path):
-        return {}
-
-    try:
-        with open(file_path, "r", encoding="utf-8") as f:
-            return json.load(f)
-    except json.JSONDecodeError:
-        logging.error(f"Error loading {file_path}")
-        return {}
-
-
-class CustomNodeManager:
-    @lru_cache(maxsize=1)
-    def build_translations(self):
-        """Load all custom nodes translations during initialization. Translations are
-        expected to be loaded from `locales/` folder.
-
-        The folder structure is expected to be the following:
-        - custom_nodes/
-            - custom_node_1/
-                - locales/
-                    - en/
-                        - main.json
-                        - commands.json
-                        - settings.json
-
-        returned translations are expected to be in the following format:
-        {
-            "en": {
-                "nodeDefs": {...},
-                "commands": {...},
-                "settings": {...},
-                ...{other main.json keys}
-            }
-        }
-        """
-
-        translations = {}
-
-        for folder in folder_paths.get_folder_paths("custom_nodes"):
-            # Sort glob results for deterministic ordering
-            for custom_node_dir in sorted(glob.glob(os.path.join(folder, "*/"))):
-                locales_dir = os.path.join(custom_node_dir, "locales")
-                if not os.path.exists(locales_dir):
-                    continue
-
-                for lang_dir in glob.glob(os.path.join(locales_dir, "*/")):
-                    lang_code = os.path.basename(os.path.dirname(lang_dir))
-
-                    if lang_code not in translations:
-                        translations[lang_code] = {}
-
-                    # Load main.json
-                    main_file = os.path.join(lang_dir, "main.json")
-                    node_translations = safe_load_json_file(main_file)
-
-                    # Load extra locale files
-                    for extra_file in EXTRA_LOCALE_FILES:
-                        extra_file_path = os.path.join(lang_dir, extra_file)
-                        key = extra_file.split(".")[0]
-                        json_data = safe_load_json_file(extra_file_path)
-                        if json_data:
-                            node_translations[key] = json_data
-
-                    if node_translations:
-                        translations[lang_code] = merge_json_recursive(
-                            translations[lang_code], node_translations
-                        )
-
-        return translations
-
-    def add_routes(self, routes, webapp, loadedModules):
-
-        @routes.get("/workflow_templates")
-        async def get_workflow_templates(request):
-            """Returns a web response that contains the map of custom_nodes names and their associated workflow templates. The ones without templates are omitted."""
-            files = [
-                file
-                for folder in folder_paths.get_folder_paths("custom_nodes")
-                for file in glob.glob(
-                    os.path.join(folder, "*/example_workflows/*.json")
-                )
-            ]
-            workflow_templates_dict = (
-                {}
-            )  # custom_nodes folder name -> example workflow names
-            for file in files:
-                custom_nodes_name = os.path.basename(
-                    os.path.dirname(os.path.dirname(file))
-                )
-                workflow_name = os.path.splitext(os.path.basename(file))[0]
-                workflow_templates_dict.setdefault(custom_nodes_name, []).append(
-                    workflow_name
-                )
-            return web.json_response(workflow_templates_dict)
-
-        # Serve workflow templates from custom nodes.
-        for module_name, module_dir in loadedModules:
-            workflows_dir = os.path.join(module_dir, "example_workflows")
-            if os.path.exists(workflows_dir):
-                webapp.add_routes(
-                    [
-                        web.static(
-                            "/api/workflow_templates/" + module_name, workflows_dir
-                        )
-                    ]
-                )
-
-        @routes.get("/i18n")
-        async def get_i18n(request):
-            """Returns translations from all custom nodes' locales folders."""
-            return web.json_response(self.build_translations())
--- a/app/frontend_management.py
+++ b/app/frontend_management.py
@ -3,69 +3,16 @@ import argparse
 import logging
 import os
 import re
-import sys
 import tempfile
 import zipfile
-import importlib
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
-from typing import TypedDict, Optional
-from importlib.metadata import version
+from typing import TypedDict

 import requests
 from typing_extensions import NotRequired
-
 from comfy.cli_args import DEFAULT_VERSION_STRING
-import app.logger
-
-# The path to the requirements.txt file
-req_path = Path(__file__).parents[1] / "requirements.txt"
-
-
-def frontend_install_warning_message():
-    """The warning message to display when the frontend version is not up to date."""
-
-    extra = ""
-    if sys.flags.no_user_site:
-        extra = "-s "
-    return f"""
-Please install the updated requirements.txt file by running:
-{sys.executable} {extra}-m pip install -r {req_path}
-
-This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.
-
-If you are on the portable package you can run: update\\update_comfyui.bat to solve this problem
-""".strip()
-
-
-def check_frontend_version():
-    """Check if the frontend version is up to date."""
-
-    def parse_version(version: str) -> tuple[int, int, int]:
-        return tuple(map(int, version.split(".")))
-
-    try:
-        frontend_version_str = version("comfyui-frontend-package")
-        frontend_version = parse_version(frontend_version_str)
-        with open(req_path, "r", encoding="utf-8") as f:
-            required_frontend = parse_version(f.readline().split("=")[-1])
-        if frontend_version < required_frontend:
-            app.logger.log_startup_warning(
-                f"""
-________________________________________________________________________
-WARNING WARNING WARNING WARNING WARNING
-
-Installed frontend version {".".join(map(str, frontend_version))} is lower than the recommended version {".".join(map(str, required_frontend))}.
-
-{frontend_install_warning_message()}
-________________________________________________________________________
-""".strip()
-            )
-        else:
-            logging.info("ComfyUI frontend version: {}".format(frontend_version_str))
-    except Exception as e:
-        logging.error(f"Failed to check frontend version: {e}")


 REQUEST_TIMEOUT = 10  # seconds
@ -162,49 +109,9 @@ def download_release_asset_zip(release: Release, destination_path: str) -> None:


 class FrontendManager:
+    DEFAULT_FRONTEND_PATH = str(Path(__file__).parents[1] / "web")
    CUSTOM_FRONTENDS_ROOT = str(Path(__file__).parents[1] / "web_custom_versions")

-    @classmethod
-    def default_frontend_path(cls) -> str:
-        try:
-            import comfyui_frontend_package
-
-            return str(importlib.resources.files(comfyui_frontend_package) / "static")
-        except ImportError:
-            logging.error(
-                f"""
-********** ERROR ***********
-
-comfyui-frontend-package is not installed.
-
-{frontend_install_warning_message()}
-
-********** ERROR ***********
-""".strip()
-            )
-            sys.exit(-1)
-
-    @classmethod
-    def templates_path(cls) -> str:
-        try:
-            import comfyui_workflow_templates
-
-            return str(
-                importlib.resources.files(comfyui_workflow_templates) / "templates"
-            )
-        except ImportError:
-            logging.error(
-                f"""
-********** ERROR ***********
-
-comfyui-workflow-templates is not installed.
-
-{frontend_install_warning_message()}
-
-********** ERROR ***********
-""".strip()
-            )
-
    @classmethod
    def parse_version_string(cls, value: str) -> tuple[str, str, str]:
        """
@ -225,15 +132,12 @@ comfyui-workflow-templates is not installed.
        return match_result.group(1), match_result.group(2), match_result.group(3)

    @classmethod
-    def init_frontend_unsafe(
-        cls, version_string: str, provider: Optional[FrontEndProvider] = None
-    ) -> str:
+    def init_frontend_unsafe(cls, version_string: str) -> str:
        """
        Initializes the frontend for the specified version.

        Args:
            version_string (str): The version string.
-            provider (FrontEndProvider, optional): The provider to use. Defaults to None.

        Returns:
            str: The path to the initialized frontend.
@ -243,28 +147,10 @@ comfyui-workflow-templates is not installed.
            main error source might be request timeout or invalid URL.
        """
        if version_string == DEFAULT_VERSION_STRING:
-            check_frontend_version()
-            return cls.default_frontend_path()
+            return cls.DEFAULT_FRONTEND_PATH

        repo_owner, repo_name, version = cls.parse_version_string(version_string)
-
-        if version.startswith("v"):
-            expected_path = str(
-                Path(cls.CUSTOM_FRONTENDS_ROOT)
-                / f"{repo_owner}_{repo_name}"
-                / version.lstrip("v")
-            )
-            if os.path.exists(expected_path):
-                logging.info(
-                    f"Using existing copy of specific frontend version tag: {repo_owner}/{repo_name}@{version}"
-                )
-                return expected_path
-
-        logging.info(
-            f"Initializing frontend: {repo_owner}/{repo_name}@{version}, requesting version details from GitHub..."
-        )
-
-        provider = provider or FrontEndProvider(repo_owner, repo_name)
+        provider = FrontEndProvider(repo_owner, repo_name)
        release = provider.get_release(version)

        semantic_version = release["tag_name"].lstrip("v")
@ -272,7 +158,6 @@ comfyui-workflow-templates is not installed.
            Path(cls.CUSTOM_FRONTENDS_ROOT) / provider.folder_name / semantic_version
        )
        if not os.path.exists(web_root):
-            try:
            os.makedirs(web_root, exist_ok=True)
            logging.info(
                "Downloading frontend(%s) version(%s) to (%s)",
@ -282,11 +167,6 @@ comfyui-workflow-templates is not installed.
            )
            logging.debug(release)
            download_release_asset_zip(release, destination_path=web_root)
-            finally:
-                # Clean up the directory if it is empty, i.e. the download failed
-                if not os.listdir(web_root):
-                    os.rmdir(web_root)
-
        return web_root

    @classmethod
@ -305,5 +185,4 @@ comfyui-workflow-templates is not installed.
        except Exception as e:
            logging.error("Failed to initialize frontend: %s", e)
            logging.info("Falling back to the default frontend.")
-            check_frontend_version()
-            return cls.default_frontend_path()
+            return cls.DEFAULT_FRONTEND_PATH
--- a/app/logger.py
+++ b/app/logger.py
@ -1,98 +0,0 @@
-from collections import deque
-from datetime import datetime
-import io
-import logging
-import sys
-import threading
-
-logs = None
-stdout_interceptor = None
-stderr_interceptor = None
-
-
-class LogInterceptor(io.TextIOWrapper):
-    def __init__(self, stream,  *args, **kwargs):
-        buffer = stream.buffer
-        encoding = stream.encoding
-        super().__init__(buffer, *args, **kwargs, encoding=encoding, line_buffering=stream.line_buffering)
-        self._lock = threading.Lock()
-        self._flush_callbacks = []
-        self._logs_since_flush = []
-
-    def write(self, data):
-        entry = {"t": datetime.now().isoformat(), "m": data}
-        with self._lock:
-            self._logs_since_flush.append(entry)
-
-            # Simple handling for cr to overwrite the last output if it isnt a full line
-            # else logs just get full of progress messages
-            if isinstance(data, str) and data.startswith("\r") and not logs[-1]["m"].endswith("\n"):
-                logs.pop()
-            logs.append(entry)
-        super().write(data)
-
-    def flush(self):
-        super().flush()
-        for cb in self._flush_callbacks:
-            cb(self._logs_since_flush)
-            self._logs_since_flush = []
-
-    def on_flush(self, callback):
-        self._flush_callbacks.append(callback)
-
-
-def get_logs():
-    return logs
-
-
-def on_flush(callback):
-    if stdout_interceptor is not None:
-        stdout_interceptor.on_flush(callback)
-    if stderr_interceptor is not None:
-        stderr_interceptor.on_flush(callback)
-
-def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool = False):
-    global logs
-    if logs:
-        return
-
-    # Override output streams and log to buffer
-    logs = deque(maxlen=capacity)
-
-    global stdout_interceptor
-    global stderr_interceptor
-    stdout_interceptor = sys.stdout = LogInterceptor(sys.stdout)
-    stderr_interceptor = sys.stderr = LogInterceptor(sys.stderr)
-
-    # Setup default global logger
-    logger = logging.getLogger()
-    logger.setLevel(log_level)
-
-    stream_handler = logging.StreamHandler()
-    stream_handler.setFormatter(logging.Formatter("%(message)s"))
-
-    if use_stdout:
-        # Only errors and critical to stderr
-        stream_handler.addFilter(lambda record: not record.levelno < logging.ERROR)
-
-        # Lesser to stdout
-        stdout_handler = logging.StreamHandler(sys.stdout)
-        stdout_handler.setFormatter(logging.Formatter("%(message)s"))
-        stdout_handler.addFilter(lambda record: record.levelno < logging.ERROR)
-        logger.addHandler(stdout_handler)
-
-    logger.addHandler(stream_handler)
-
-
-STARTUP_WARNINGS = []
-
-
-def log_startup_warning(msg):
-    logging.warning(msg)
-    STARTUP_WARNINGS.append(msg)
-
-
-def print_startup_warnings():
-    for s in STARTUP_WARNINGS:
-        logging.warning(s)
-    STARTUP_WARNINGS.clear()
--- a/app/model_manager.py
+++ b/app/model_manager.py
@ -1,184 +0,0 @@
-from __future__ import annotations
-
-import os
-import base64
-import json
-import time
-import logging
-import folder_paths
-import glob
-import comfy.utils
-from aiohttp import web
-from PIL import Image
-from io import BytesIO
-from folder_paths import map_legacy, filter_files_extensions, filter_files_content_types
-
-
-class ModelFileManager:
-    def __init__(self) -> None:
-        self.cache: dict[str, tuple[list[dict], dict[str, float], float]] = {}
-
-    def get_cache(self, key: str, default=None) -> tuple[list[dict], dict[str, float], float] | None:
-        return self.cache.get(key, default)
-
-    def set_cache(self, key: str, value: tuple[list[dict], dict[str, float], float]):
-        self.cache[key] = value
-
-    def clear_cache(self):
-        self.cache.clear()
-
-    def add_routes(self, routes):
-        # NOTE: This is an experiment to replace `/models`
-        @routes.get("/experiment/models")
-        async def get_model_folders(request):
-            model_types = list(folder_paths.folder_names_and_paths.keys())
-            folder_black_list = ["configs", "custom_nodes"]
-            output_folders: list[dict] = []
-            for folder in model_types:
-                if folder in folder_black_list:
-                    continue
-                output_folders.append({"name": folder, "folders": folder_paths.get_folder_paths(folder)})
-            return web.json_response(output_folders)
-
-        # NOTE: This is an experiment to replace `/models/{folder}`
-        @routes.get("/experiment/models/{folder}")
-        async def get_all_models(request):
-            folder = request.match_info.get("folder", None)
-            if not folder in folder_paths.folder_names_and_paths:
-                return web.Response(status=404)
-            files = self.get_model_file_list(folder)
-            return web.json_response(files)
-
-        @routes.get("/experiment/models/preview/{folder}/{path_index}/{filename:.*}")
-        async def get_model_preview(request):
-            folder_name = request.match_info.get("folder", None)
-            path_index = int(request.match_info.get("path_index", None))
-            filename = request.match_info.get("filename", None)
-
-            if not folder_name in folder_paths.folder_names_and_paths:
-                return web.Response(status=404)
-
-            folders = folder_paths.folder_names_and_paths[folder_name]
-            folder = folders[0][path_index]
-            full_filename = os.path.join(folder, filename)
-
-            previews = self.get_model_previews(full_filename)
-            default_preview = previews[0] if len(previews) > 0 else None
-            if default_preview is None or (isinstance(default_preview, str) and not os.path.isfile(default_preview)):
-                return web.Response(status=404)
-
-            try:
-                with Image.open(default_preview) as img:
-                    img_bytes = BytesIO()
-                    img.save(img_bytes, format="WEBP")
-                    img_bytes.seek(0)
-                    return web.Response(body=img_bytes.getvalue(), content_type="image/webp")
-            except:
-                return web.Response(status=404)
-
-    def get_model_file_list(self, folder_name: str):
-        folder_name = map_legacy(folder_name)
-        folders = folder_paths.folder_names_and_paths[folder_name]
-        output_list: list[dict] = []
-
-        for index, folder in enumerate(folders[0]):
-            if not os.path.isdir(folder):
-                continue
-            out = self.cache_model_file_list_(folder)
-            if out is None:
-                out = self.recursive_search_models_(folder, index)
-                self.set_cache(folder, out)
-            output_list.extend(out[0])
-
-        return output_list
-
-    def cache_model_file_list_(self, folder: str):
-        model_file_list_cache = self.get_cache(folder)
-
-        if model_file_list_cache is None:
-            return None
-        if not os.path.isdir(folder):
-            return None
-        if os.path.getmtime(folder) != model_file_list_cache[1]:
-            return None
-        for x in model_file_list_cache[1]:
-            time_modified = model_file_list_cache[1][x]
-            folder = x
-            if os.path.getmtime(folder) != time_modified:
-                return None
-
-        return model_file_list_cache
-
-    def recursive_search_models_(self, directory: str, pathIndex: int) -> tuple[list[str], dict[str, float], float]:
-        if not os.path.isdir(directory):
-            return [], {}, time.perf_counter()
-
-        excluded_dir_names = [".git"]
-        # TODO use settings
-        include_hidden_files = False
-
-        result: list[str] = []
-        dirs: dict[str, float] = {}
-
-        for dirpath, subdirs, filenames in os.walk(directory, followlinks=True, topdown=True):
-            subdirs[:] = [d for d in subdirs if d not in excluded_dir_names]
-            if not include_hidden_files:
-                subdirs[:] = [d for d in subdirs if not d.startswith(".")]
-                filenames = [f for f in filenames if not f.startswith(".")]
-
-            filenames = filter_files_extensions(filenames, folder_paths.supported_pt_extensions)
-
-            for file_name in filenames:
-                try:
-                    relative_path = os.path.relpath(os.path.join(dirpath, file_name), directory)
-                    result.append(relative_path)
-                except:
-                    logging.warning(f"Warning: Unable to access {file_name}. Skipping this file.")
-                    continue
-
-            for d in subdirs:
-                path: str = os.path.join(dirpath, d)
-                try:
-                    dirs[path] = os.path.getmtime(path)
-                except FileNotFoundError:
-                    logging.warning(f"Warning: Unable to access {path}. Skipping this path.")
-                    continue
-
-        return [{"name": f, "pathIndex": pathIndex} for f in result], dirs, time.perf_counter()
-
-    def get_model_previews(self, filepath: str) -> list[str | BytesIO]:
-        dirname = os.path.dirname(filepath)
-
-        if not os.path.exists(dirname):
-            return []
-
-        basename = os.path.splitext(filepath)[0]
-        match_files = glob.glob(f"{basename}.*", recursive=False)
-        image_files = filter_files_content_types(match_files, "image")
-        safetensors_file = next(filter(lambda x: x.endswith(".safetensors"), match_files), None)
-        safetensors_metadata = {}
-
-        result: list[str | BytesIO] = []
-
-        for filename in image_files:
-            _basename = os.path.splitext(filename)[0]
-            if _basename == basename:
-                result.append(filename)
-            if _basename == f"{basename}.preview":
-                result.append(filename)
-
-        if safetensors_file:
-            safetensors_filepath = os.path.join(dirname, safetensors_file)
-            header = comfy.utils.safetensors_header(safetensors_filepath, max_size=8*1024*1024)
-            if header:
-                safetensors_metadata = json.loads(header)
-        safetensors_images = safetensors_metadata.get("__metadata__", {}).get("ssmd_cover_images", None)
-        if safetensors_images:
-            safetensors_images = json.loads(safetensors_images)
-            for image in safetensors_images:
-                result.append(BytesIO(base64.b64decode(image)))
-
-        return result
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.clear_cache()
--- a/app/user_manager.py
+++ b/app/user_manager.py
@ -1,58 +1,38 @@
-from __future__ import annotations
 import json
 import os
 import re
 import uuid
 import glob
 import shutil
-import logging
 from aiohttp import web
-from urllib import parse
 from comfy.cli_args import args
-import folder_paths
+from folder_paths import user_directory
 from .app_settings import AppSettings
-from typing import TypedDict

 default_user = "default"
-
-
-class FileInfo(TypedDict):
-    path: str
-    size: int
-    modified: int
-
-
-def get_file_info(path: str, relative_to: str) -> FileInfo:
-    return {
-        "path": os.path.relpath(path, relative_to).replace(os.sep, '/'),
-        "size": os.path.getsize(path),
-        "modified": os.path.getmtime(path)
-    }
+users_file = os.path.join(user_directory, "users.json")


 class UserManager():
    def __init__(self):
-        user_directory = folder_paths.get_user_directory()
+        global user_directory

        self.settings = AppSettings(self)
        if not os.path.exists(user_directory):
-            os.makedirs(user_directory, exist_ok=True)
+            os.mkdir(user_directory)
            if not args.multi_user:
-                logging.warning("****** User settings have been changed to be stored on the server instead of browser storage. ******")
-                logging.warning("****** For multi-user setups add the --multi-user CLI argument to enable multiple user profiles. ******")
+                print("****** User settings have been changed to be stored on the server instead of browser storage. ******")
+                print("****** For multi-user setups add the --multi-user CLI argument to enable multiple user profiles. ******")

        if args.multi_user:
-            if os.path.isfile(self.get_users_file()):
-                with open(self.get_users_file()) as f:
+            if os.path.isfile(users_file):
+                with open(users_file) as f:
                    self.users = json.load(f)
            else:
                self.users = {}
        else:
            self.users = {"default": "default"}

-    def get_users_file(self):
-        return os.path.join(folder_paths.get_user_directory(), "users.json")
-
    def get_request_user_id(self, request):
        user = "default"
        if args.multi_user and "comfy-user" in request.headers:
@ -64,7 +44,7 @@ class UserManager():
        return user

    def get_request_user_filepath(self, request, file, type="userdata", create_dir=True):
-        user_directory = folder_paths.get_user_directory()
+        global user_directory

        if type == "userdata":
            root_dir = user_directory
@ -79,10 +59,6 @@ class UserManager():
            return None

        if file is not None:
-            # Check if filename is url encoded
-            if "%" in file:
-                file = parse.unquote(file)
-
            # prevent leaving /{type}/{user}
            path = os.path.abspath(os.path.join(user_root, file))
            if os.path.commonpath((user_root, path)) != user_root:
@ -104,7 +80,8 @@ class UserManager():

        self.users[user_id] = name

-        with open(self.get_users_file(), "w") as f:
+        global users_file
+        with open(users_file, "w") as f:
            json.dump(self.users, f)

        return user_id
@ -135,65 +112,25 @@ class UserManager():

        @routes.get("/userdata")
        async def listuserdata(request):
-            """
-            List user data files in a specified directory.
-
-            This endpoint allows listing files in a user's data directory, with options for recursion,
-            full file information, and path splitting.
-
-            Query Parameters:
-            - dir (required): The directory to list files from.
-            - recurse (optional): If "true", recursively list files in subdirectories.
-            - full_info (optional): If "true", return detailed file information (path, size, modified time).
-            - split (optional): If "true", split file paths into components (only applies when full_info is false).
-
-            Returns:
-            - 400: If 'dir' parameter is missing.
-            - 403: If the requested path is not allowed.
-            - 404: If the requested directory does not exist.
-            - 200: JSON response with the list of files or file information.
-
-            The response format depends on the query parameters:
-            - Default: List of relative file paths.
-            - full_info=true: List of dictionaries with file details.
-            - split=true (and full_info=false): List of lists, each containing path components.
-            """
            directory = request.rel_url.query.get('dir', '')
            if not directory:
-                return web.Response(status=400, text="Directory not provided")
+                return web.Response(status=400)
                
            path = self.get_request_user_filepath(request, directory)
            if not path:
-                return web.Response(status=403, text="Invalid directory")
+                return web.Response(status=403)
            
            if not os.path.exists(path):
-                return web.Response(status=404, text="Directory not found")
+                return web.Response(status=404)
            
            recurse = request.rel_url.query.get('recurse', '').lower() == "true"
-            full_info = request.rel_url.query.get('full_info', '').lower() == "true"
+            results = glob.glob(os.path.join(
+                glob.escape(path), '**/*'), recursive=recurse)
+            results = [os.path.relpath(x, path) for x in results if os.path.isfile(x)]
+            
            split_path = request.rel_url.query.get('split', '').lower() == "true"
-
-            # Use different patterns based on whether we're recursing or not
-            if recurse:
-                pattern = os.path.join(glob.escape(path), '**', '*')
-            else:
-                pattern = os.path.join(glob.escape(path), '*')
-
-            def process_full_path(full_path: str) -> FileInfo | str | list[str]:
-                if full_info:
-                    return get_file_info(full_path, path)
-
-                rel_path = os.path.relpath(full_path, path).replace(os.sep, '/')
            if split_path:
-                    return [rel_path] + rel_path.split('/')
-
-                return rel_path
-
-            results = [
-                process_full_path(full_path)
-                for full_path in glob.glob(pattern, recursive=recurse)
-                if os.path.isfile(full_path)
-            ]
+                results = [[x] + x.split(os.sep) for x in results]

            return web.json_response(results)

@ -221,51 +158,20 @@ class UserManager():

        @routes.post("/userdata/{file}")
        async def post_userdata(request):
-            """
-            Upload or update a user data file.
-
-            This endpoint handles file uploads to a user's data directory, with options for
-            controlling overwrite behavior and response format.
-
-            Query Parameters:
-            - overwrite (optional): If "false", prevents overwriting existing files. Defaults to "true".
-            - full_info (optional): If "true", returns detailed file information (path, size, modified time).
-                                  If "false", returns only the relative file path.
-
-            Path Parameters:
-            - file: The target file path (URL encoded if necessary).
-
-            Returns:
-            - 400: If 'file' parameter is missing.
-            - 403: If the requested path is not allowed.
-            - 409: If overwrite=false and the file already exists.
-            - 200: JSON response with either:
-                  - Full file information (if full_info=true)
-                  - Relative file path (if full_info=false)
-
-            The request body should contain the raw file content to be written.
-            """
            path = get_user_data_path(request)
            if not isinstance(path, str):
                return path
            
-            overwrite = request.query.get("overwrite", 'true') != "false"
-            full_info = request.query.get('full_info', 'false').lower() == "true"
-
+            overwrite = request.query["overwrite"] != "false"
            if not overwrite and os.path.exists(path):
-                return web.Response(status=409, text="File already exists")
+                return web.Response(status=409)

            body = await request.read()

            with open(path, "wb") as f:
                f.write(body)
                
-            user_path = self.get_request_user_filepath(request, None)
-            if full_info:
-                resp = get_file_info(path, user_path)
-            else:
-                resp = os.path.relpath(path, user_path)
-
+            resp = os.path.relpath(path, self.get_request_user_filepath(request, None))
            return web.json_response(resp)

        @routes.delete("/userdata/{file}")
@ -280,30 +186,6 @@ class UserManager():

        @routes.post("/userdata/{file}/move/{dest}")
        async def move_userdata(request):
-            """
-            Move or rename a user data file.
-
-            This endpoint handles moving or renaming files within a user's data directory, with options for
-            controlling overwrite behavior and response format.
-
-            Path Parameters:
-            - file: The source file path (URL encoded if necessary)
-            - dest: The destination file path (URL encoded if necessary)
-
-            Query Parameters:
-            - overwrite (optional): If "false", prevents overwriting existing files. Defaults to "true".
-            - full_info (optional): If "true", returns detailed file information (path, size, modified time).
-                                  If "false", returns only the relative file path.
-
-            Returns:
-            - 400: If either 'file' or 'dest' parameter is missing
-            - 403: If either requested path is not allowed
-            - 404: If the source file does not exist
-            - 409: If overwrite=false and the destination file already exists
-            - 200: JSON response with either:
-                  - Full file information (if full_info=true)
-                  - Relative file path (if full_info=false)
-            """
            source = get_user_data_path(request, check_exists=True)
            if not isinstance(source, str):
                return source
@ -312,19 +194,12 @@ class UserManager():
            if not isinstance(source, str):
                return dest
            
-            overwrite = request.query.get("overwrite", 'true') != "false"
-            full_info = request.query.get('full_info', 'false').lower() == "true"
-
+            overwrite = request.query["overwrite"] != "false"
            if not overwrite and os.path.exists(dest):
-                return web.Response(status=409, text="File already exists")
+                return web.Response(status=409)

-            logging.info(f"moving '{source}' -> '{dest}'")
+            print(f"moving '{source}' -> '{dest}'")
            shutil.move(source, dest)
                
-            user_path = self.get_request_user_filepath(request, None)
-            if full_info:
-                resp = get_file_info(dest, user_path)
-            else:
-                resp = os.path.relpath(dest, user_path)
-
+            resp = os.path.relpath(dest, self.get_request_user_filepath(request, None))
            return web.json_response(resp)
--- a/comfy/cldm/cldm.py
+++ b/comfy/cldm/cldm.py
@ -2,9 +2,11 @@
 #and modified

 import torch
+import torch as th
 import torch.nn as nn

 from ..ldm.modules.diffusionmodules.util import (
+    zero_module,
    timestep_embedding,
 )

@ -160,6 +162,7 @@ class ControlNet(nn.Module):
            if isinstance(self.num_classes, int):
                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
            elif self.num_classes == "continuous":
+                print("setting up linear c_adm embedding layer")
                self.label_emb = nn.Linear(1, time_embed_dim)
            elif self.num_classes == "sequential":
                assert adm_in_channels is not None
@ -412,6 +415,7 @@ class ControlNet(nn.Module):
        out_output = []
        out_middle = []

+        hs = []
        if self.num_classes is not None:
            assert y.shape[0] == x.shape[0]
            emb = emb + self.label_emb(y)
--- a/comfy/cldm/dit_embedder.py
+++ b/comfy/cldm/dit_embedder.py
@ -1,120 +0,0 @@
-import math
-from typing import List, Optional, Tuple
-
-import torch
-import torch.nn as nn
-from torch import Tensor
-
-from comfy.ldm.modules.diffusionmodules.mmdit import DismantledBlock, PatchEmbed, VectorEmbedder, TimestepEmbedder, get_2d_sincos_pos_embed_torch
-
-
-class ControlNetEmbedder(nn.Module):
-
-    def __init__(
-        self,
-        img_size: int,
-        patch_size: int,
-        in_chans: int,
-        attention_head_dim: int,
-        num_attention_heads: int,
-        adm_in_channels: int,
-        num_layers: int,
-        main_model_double: int,
-        double_y_emb: bool,
-        device: torch.device,
-        dtype: torch.dtype,
-        pos_embed_max_size: Optional[int] = None,
-        operations = None,
-    ):
-        super().__init__()
-        self.main_model_double = main_model_double
-        self.dtype = dtype
-        self.hidden_size = num_attention_heads * attention_head_dim
-        self.patch_size = patch_size
-        self.x_embedder = PatchEmbed(
-            img_size=img_size,
-            patch_size=patch_size,
-            in_chans=in_chans,
-            embed_dim=self.hidden_size,
-            strict_img_size=pos_embed_max_size is None,
-            device=device,
-            dtype=dtype,
-            operations=operations,
-        )
-
-        self.t_embedder = TimestepEmbedder(self.hidden_size, dtype=dtype, device=device, operations=operations)
-
-        self.double_y_emb = double_y_emb
-        if self.double_y_emb:
-            self.orig_y_embedder = VectorEmbedder(
-                adm_in_channels, self.hidden_size, dtype, device, operations=operations
-            )
-            self.y_embedder = VectorEmbedder(
-                self.hidden_size, self.hidden_size, dtype, device, operations=operations
-            )
-        else:
-            self.y_embedder = VectorEmbedder(
-                adm_in_channels, self.hidden_size, dtype, device, operations=operations
-            )
-
-        self.transformer_blocks = nn.ModuleList(
-            DismantledBlock(
-                hidden_size=self.hidden_size, num_heads=num_attention_heads, qkv_bias=True,
-                dtype=dtype, device=device, operations=operations
-            )
-            for _ in range(num_layers)
-        )
-
-        # self.use_y_embedder = pooled_projection_dim != self.time_text_embed.text_embedder.linear_1.in_features
-        # TODO double check this logic when 8b
-        self.use_y_embedder = True
-
-        self.controlnet_blocks = nn.ModuleList([])
-        for _ in range(len(self.transformer_blocks)):
-            controlnet_block = operations.Linear(self.hidden_size, self.hidden_size, dtype=dtype, device=device)
-            self.controlnet_blocks.append(controlnet_block)
-
-        self.pos_embed_input = PatchEmbed(
-            img_size=img_size,
-            patch_size=patch_size,
-            in_chans=in_chans,
-            embed_dim=self.hidden_size,
-            strict_img_size=False,
-            device=device,
-            dtype=dtype,
-            operations=operations,
-        )
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        timesteps: torch.Tensor,
-        y: Optional[torch.Tensor] = None,
-        context: Optional[torch.Tensor] = None,
-        hint = None,
-    ) -> Tuple[Tensor, List[Tensor]]:
-        x_shape = list(x.shape)
-        x = self.x_embedder(x)
-        if not self.double_y_emb:
-            h = (x_shape[-2] + 1) // self.patch_size
-            w = (x_shape[-1] + 1) // self.patch_size
-            x += get_2d_sincos_pos_embed_torch(self.hidden_size, w, h, device=x.device)
-        c = self.t_embedder(timesteps, dtype=x.dtype)
-        if y is not None and self.y_embedder is not None:
-            if self.double_y_emb:
-                y = self.orig_y_embedder(y)
-            y = self.y_embedder(y)
-            c = c + y
-
-        x = x + self.pos_embed_input(hint)
-
-        block_out = ()
-
-        repeat = math.ceil(self.main_model_double / len(self.transformer_blocks))
-        for i in range(len(self.transformer_blocks)):
-            out = self.transformer_blocks[i](x, c)
-            if not self.double_y_emb:
-                x = out
-            block_out += (self.controlnet_blocks[i](out),) * repeat
-
-        return {"output": block_out}
--- a/comfy/cldm/mmdit.py
+++ b/comfy/cldm/mmdit.py
@ -1,12 +1,11 @@
 import torch
-from typing import Optional
+from typing import Dict, Optional
 import comfy.ldm.modules.diffusionmodules.mmdit

 class ControlNet(comfy.ldm.modules.diffusionmodules.mmdit.MMDiT):
    def __init__(
        self,
        num_blocks = None,
-        control_latent_channels = None,
        dtype = None,
        device = None,
        operations = None,
@ -18,13 +17,10 @@ class ControlNet(comfy.ldm.modules.diffusionmodules.mmdit.MMDiT):
        for _ in range(len(self.joint_blocks)):
            self.controlnet_blocks.append(operations.Linear(self.hidden_size, self.hidden_size, device=device, dtype=dtype))

-        if control_latent_channels is None:
-            control_latent_channels = self.in_channels
-
        self.pos_embed_input = comfy.ldm.modules.diffusionmodules.mmdit.PatchEmbed(
            None,
            self.patch_size,
-            control_latent_channels,
+            self.in_channels,
            self.hidden_size,
            bias=True,
            strict_img_size=False,
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -1,6 +1,7 @@
 import argparse
 import enum
 import os
+from typing import Optional
 import comfy.options


@ -35,18 +36,17 @@ class EnumAction(argparse.Action):

 parser = argparse.ArgumentParser()

-parser.add_argument("--listen", type=str, default="127.0.0.1", metavar="IP", nargs="?", const="0.0.0.0,::", help="Specify the IP address to listen on (default: 127.0.0.1). You can give a list of ip addresses by separating them with a comma like: 127.2.2.2,127.3.3.3 If --listen is provided without an argument, it defaults to 0.0.0.0,:: (listens on all ipv4 and ipv6)")
+parser.add_argument("--listen", type=str, default="127.0.0.1", metavar="IP", nargs="?", const="0.0.0.0", help="Specify the IP address to listen on (default: 127.0.0.1). If --listen is provided without an argument, it defaults to 0.0.0.0. (listens on all)")
 parser.add_argument("--port", type=int, default=8188, help="Set the listen port.")
 parser.add_argument("--tls-keyfile", type=str, help="Path to TLS (SSL) key file. Enables TLS, makes app accessible at https://... requires --tls-certfile to function")
 parser.add_argument("--tls-certfile", type=str, help="Path to TLS (SSL) certificate file. Enables TLS, makes app accessible at https://... requires --tls-keyfile to function")
 parser.add_argument("--enable-cors-header", type=str, default=None, metavar="ORIGIN", nargs="?", const="*", help="Enable CORS (Cross-Origin Resource Sharing) with optional origin or allow all with default '*'.")
 parser.add_argument("--max-upload-size", type=float, default=100, help="Set the maximum upload size in MB.")

-parser.add_argument("--base-directory", type=str, default=None, help="Set the ComfyUI base directory for models, custom_nodes, input, output, temp, and user directories.")
 parser.add_argument("--extra-model-paths-config", type=str, default=None, metavar="PATH", nargs='+', action='append', help="Load one or more extra_model_paths.yaml files.")
-parser.add_argument("--output-directory", type=str, default=None, help="Set the ComfyUI output directory. Overrides --base-directory.")
-parser.add_argument("--temp-directory", type=str, default=None, help="Set the ComfyUI temp directory (default is in the ComfyUI directory). Overrides --base-directory.")
-parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.")
+parser.add_argument("--output-directory", type=str, default=None, help="Set the ComfyUI output directory.")
+parser.add_argument("--temp-directory", type=str, default=None, help="Set the ComfyUI temp directory (default is in the ComfyUI directory).")
+parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory.")
 parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
 parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.")
 parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use.")
@ -60,10 +60,8 @@ fp_group.add_argument("--force-fp32", action="store_true", help="Force fp32 (If
 fp_group.add_argument("--force-fp16", action="store_true", help="Force fp16.")

 fpunet_group = parser.add_mutually_exclusive_group()
-fpunet_group.add_argument("--fp32-unet", action="store_true", help="Run the diffusion model in fp32.")
-fpunet_group.add_argument("--fp64-unet", action="store_true", help="Run the diffusion model in fp64.")
-fpunet_group.add_argument("--bf16-unet", action="store_true", help="Run the diffusion model in bf16.")
-fpunet_group.add_argument("--fp16-unet", action="store_true", help="Run the diffusion model in fp16")
+fpunet_group.add_argument("--bf16-unet", action="store_true", help="Run the UNET in bf16. This should only be used for testing stuff.")
+fpunet_group.add_argument("--fp16-unet", action="store_true", help="Store unet weights in fp16.")
 fpunet_group.add_argument("--fp8_e4m3fn-unet", action="store_true", help="Store unet weights in fp8_e4m3fn.")
 fpunet_group.add_argument("--fp8_e5m2-unet", action="store_true", help="Store unet weights in fp8_e5m2.")

@ -79,14 +77,12 @@ fpte_group.add_argument("--fp8_e4m3fn-text-enc", action="store_true", help="Stor
 fpte_group.add_argument("--fp8_e5m2-text-enc", action="store_true", help="Store text encoder weights in fp8 (e5m2 variant).")
 fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text encoder weights in fp16.")
 fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.")
-fpte_group.add_argument("--bf16-text-enc", action="store_true", help="Store text encoder weights in bf16.")

 parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")

 parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")

-parser.add_argument("--oneapi-device-selector", type=str, default=None, metavar="SELECTOR_STRING", help="Sets the oneAPI device(s) this instance will use.")
-parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize default when loading models with Intel's Extension for Pytorch.")
+parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize when loading models with Intel GPUs.")

 class LatentPreviewMethod(enum.Enum):
    NoPreviews = "none"
@ -96,19 +92,10 @@ class LatentPreviewMethod(enum.Enum):

 parser.add_argument("--preview-method", type=LatentPreviewMethod, default=LatentPreviewMethod.NoPreviews, help="Default preview method for sampler nodes.", action=EnumAction)

-parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")
-
-cache_group = parser.add_mutually_exclusive_group()
-cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
-cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
-cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
-
 attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
 attn_group.add_argument("--use-quad-cross-attention", action="store_true", help="Use the sub-quadratic cross attention optimization . Ignored when xformers is used.")
 attn_group.add_argument("--use-pytorch-cross-attention", action="store_true", help="Use the new pytorch 2.0 cross attention function.")
-attn_group.add_argument("--use-sage-attention", action="store_true", help="Use sage attention.")
-attn_group.add_argument("--use-flash-attention", action="store_true", help="Use FlashAttention.")

 parser.add_argument("--disable-xformers", action="store_true", help="Disable xformers.")

@ -125,21 +112,11 @@ vram_group.add_argument("--lowvram", action="store_true", help="Split the unet i
 vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.")
 vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")

-parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.")
-
-
 parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha1', 'sha256', 'sha512'], default='sha256', help="Allows you to choose the hash function to use for duplicate filename / contents comparison. Default is sha256.")

 parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
 parser.add_argument("--deterministic", action="store_true", help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")

-class PerformanceFeature(enum.Enum):
-    Fp16Accumulation = "fp16_accumulation"
-    Fp8MatrixMultiplication = "fp8_matrix_mult"
-    CublasOps = "cublas_ops"
-
-parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops")
-
 parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
 parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI.")
 parser.add_argument("--windows-standalone-build", action="store_true", help="Windows standalone build: Enable convenient things that most people using the standalone windows build will probably enjoy (like auto opening the page on startup).")
@ -149,8 +126,7 @@ parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Dis

 parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")

-parser.add_argument("--verbose", default='INFO', const='DEBUG', nargs="?", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Set the logging level')
-parser.add_argument("--log-stdout", action="store_true", help="Send normal process output to stdout instead of stderr (default).")
+parser.add_argument("--verbose", action="store_true", help="Enables more debug prints.")

 # The default built-in provider hosted under web/
 DEFAULT_VERSION_STRING = "comfyanonymous/ComfyUI@latest"
@ -169,14 +145,13 @@ parser.add_argument(
    """,
 )

-def is_valid_directory(path: str) -> str:
-    """Validate if the given path is a directory, and check permissions."""
-    if not os.path.exists(path):
-        raise argparse.ArgumentTypeError(f"The path '{path}' does not exist.")
+def is_valid_directory(path: Optional[str]) -> Optional[str]:
+    """Validate if the given path is a directory."""
+    if path is None:
+        return None
+
    if not os.path.isdir(path):
-        raise argparse.ArgumentTypeError(f"'{path}' is not a directory.")
-    if not os.access(path, os.R_OK):
-        raise argparse.ArgumentTypeError(f"You do not have read permissions for '{path}'.")
+        raise argparse.ArgumentTypeError(f"{path} is not a valid directory.")
    return path

 parser.add_argument(
@ -186,10 +161,6 @@ parser.add_argument(
    help="The local filesystem path to the directory where the frontend is located. Overrides --front-end-version.",
 )

-parser.add_argument("--user-directory", type=is_valid_directory, default=None, help="Set the ComfyUI user directory with an absolute path. Overrides --base-directory.")
-
-parser.add_argument("--enable-compress-response-body", action="store_true", help="Enable compressing response body.")
-
 if comfy.options.args_parsing:
    args = parser.parse_args()
 else:
@ -201,16 +172,9 @@ if args.windows_standalone_build:
 if args.disable_auto_launch:
    args.auto_launch = False

-if args.force_fp16:
-    args.fp16_unet = True
+import logging
+logging_level = logging.INFO
+if args.verbose:
+    logging_level = logging.DEBUG

-
-# '--fast' is not provided, use an empty set
-if args.fast is None:
-    args.fast = set()
-# '--fast' is provided with an empty list, enable all optimizations
-elif args.fast == []:
-    args.fast = set(PerformanceFeature)
-# '--fast' is provided with a list of performance features, use that list
-else:
-    args.fast = set(args.fast)
+logging.basicConfig(format="%(message)s", level=logging_level)
--- a/comfy/clip_model.py
+++ b/comfy/clip_model.py
@ -23,7 +23,6 @@ class CLIPAttention(torch.nn.Module):

 ACTIVATIONS = {"quick_gelu": lambda a: a * torch.sigmoid(1.702 * a),
               "gelu": torch.nn.functional.gelu,
-               "gelu_pytorch_tanh": lambda a: torch.nn.functional.gelu(a, approximate="tanh"),
 }

 class CLIPMLP(torch.nn.Module):
@ -89,27 +88,21 @@ class CLIPTextModel_(torch.nn.Module):
        heads = config_dict["num_attention_heads"]
        intermediate_size = config_dict["intermediate_size"]
        intermediate_activation = config_dict["hidden_act"]
-        num_positions = config_dict["max_position_embeddings"]
        self.eos_token_id = config_dict["eos_token_id"]

        super().__init__()
-        self.embeddings = CLIPEmbeddings(embed_dim, num_positions=num_positions, dtype=dtype, device=device, operations=operations)
+        self.embeddings = CLIPEmbeddings(embed_dim, dtype=dtype, device=device, operations=operations)
        self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
        self.final_layer_norm = operations.LayerNorm(embed_dim, dtype=dtype, device=device)

-    def forward(self, input_tokens=None, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
-        if embeds is not None:
-            x = embeds + comfy.ops.cast_to(self.embeddings.position_embedding.weight, dtype=dtype, device=embeds.device)
-        else:
+    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=torch.float32):
        x = self.embeddings(input_tokens, dtype=dtype)
-
        mask = None
        if attention_mask is not None:
            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1])
-            mask = mask.masked_fill(mask.to(torch.bool), -torch.finfo(x.dtype).max)
-
-        causal_mask = torch.full((x.shape[1], x.shape[1]), -torch.finfo(x.dtype).max, dtype=x.dtype, device=x.device).triu_(1)
+            mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))

+        causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1)
        if mask is not None:
            mask += causal_mask
        else:
@ -120,9 +113,6 @@ class CLIPTextModel_(torch.nn.Module):
        if i is not None and final_layer_norm_intermediate:
            i = self.final_layer_norm(i)

-        if num_tokens is not None:
-            pooled_output = x[list(range(x.shape[0])), list(map(lambda a: a - 1, num_tokens))]
-        else:
        pooled_output = x[torch.arange(x.shape[0], device=x.device), (torch.round(input_tokens).to(dtype=torch.int, device=x.device) == self.eos_token_id).int().argmax(dim=-1),]
        return x, i, pooled_output

@ -133,6 +123,7 @@ class CLIPTextModel(torch.nn.Module):
        self.text_model = CLIPTextModel_(config_dict, dtype, device, operations)
        embed_dim = config_dict["hidden_size"]
        self.text_projection = operations.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device)
+        self.text_projection.weight.copy_(torch.eye(embed_dim))
        self.dtype = dtype

    def get_input_embeddings(self):
@ -148,35 +139,27 @@ class CLIPTextModel(torch.nn.Module):


 class CLIPVisionEmbeddings(torch.nn.Module):
-    def __init__(self, embed_dim, num_channels=3, patch_size=14, image_size=224, model_type="", dtype=None, device=None, operations=None):
+    def __init__(self, embed_dim, num_channels=3, patch_size=14, image_size=224, dtype=None, device=None, operations=None):
        super().__init__()
-
-        num_patches = (image_size // patch_size) ** 2
-        if model_type == "siglip_vision_model":
-            self.class_embedding = None
-            patch_bias = True
-        else:
-            num_patches = num_patches + 1
        self.class_embedding = torch.nn.Parameter(torch.empty(embed_dim, dtype=dtype, device=device))
-            patch_bias = False

        self.patch_embedding = operations.Conv2d(
            in_channels=num_channels,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
-            bias=patch_bias,
+            bias=False,
            dtype=dtype,
            device=device
        )

-        self.position_embedding = operations.Embedding(num_patches, embed_dim, dtype=dtype, device=device)
+        num_patches = (image_size // patch_size) ** 2
+        num_positions = num_patches + 1
+        self.position_embedding = operations.Embedding(num_positions, embed_dim, dtype=dtype, device=device)

    def forward(self, pixel_values):
        embeds = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)
-        if self.class_embedding is not None:
-            embeds = torch.cat([comfy.ops.cast_to_input(self.class_embedding, embeds).expand(pixel_values.shape[0], 1, -1), embeds], dim=1)
-        return embeds + comfy.ops.cast_to_input(self.position_embedding.weight, embeds)
+        return torch.cat([comfy.ops.cast_to_input(self.class_embedding, embeds).expand(pixel_values.shape[0], 1, -1), embeds], dim=1) + comfy.ops.cast_to_input(self.position_embedding.weight, embeds)


 class CLIPVision(torch.nn.Module):
@ -187,15 +170,9 @@ class CLIPVision(torch.nn.Module):
        heads = config_dict["num_attention_heads"]
        intermediate_size = config_dict["intermediate_size"]
        intermediate_activation = config_dict["hidden_act"]
-        model_type = config_dict["model_type"]

-        self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], model_type=model_type, dtype=dtype, device=device, operations=operations)
-        if model_type == "siglip_vision_model":
-            self.pre_layrnorm = lambda a: a
-            self.output_layernorm = True
-        else:
+        self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], dtype=dtype, device=device, operations=operations)
        self.pre_layrnorm = operations.LayerNorm(embed_dim)
-            self.output_layernorm = False
        self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
        self.post_layernorm = operations.LayerNorm(embed_dim)

@ -204,41 +181,16 @@ class CLIPVision(torch.nn.Module):
        x = self.pre_layrnorm(x)
        #TODO: attention_mask?
        x, i = self.encoder(x, mask=None, intermediate_output=intermediate_output)
-        if self.output_layernorm:
-            x = self.post_layernorm(x)
-            pooled_output = x
-        else:
        pooled_output = self.post_layernorm(x[:, 0, :])
        return x, i, pooled_output

-class LlavaProjector(torch.nn.Module):
-    def __init__(self, in_dim, out_dim, dtype, device, operations):
-        super().__init__()
-        self.linear_1 = operations.Linear(in_dim, out_dim, bias=True, device=device, dtype=dtype)
-        self.linear_2 = operations.Linear(out_dim, out_dim, bias=True, device=device, dtype=dtype)
-
-    def forward(self, x):
-        return self.linear_2(torch.nn.functional.gelu(self.linear_1(x[:, 1:])))
-
 class CLIPVisionModelProjection(torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        self.vision_model = CLIPVision(config_dict, dtype, device, operations)
-        if "projection_dim" in config_dict:
        self.visual_projection = operations.Linear(config_dict["hidden_size"], config_dict["projection_dim"], bias=False)
-        else:
-            self.visual_projection = lambda a: a
-
-        if "llava3" == config_dict.get("projector_type", None):
-            self.multi_modal_projector = LlavaProjector(config_dict["hidden_size"], 4096, dtype, device, operations)
-        else:
-            self.multi_modal_projector = None

    def forward(self, *args, **kwargs):
        x = self.vision_model(*args, **kwargs)
        out = self.visual_projection(x[2])
-        projected = None
-        if self.multi_modal_projector is not None:
-            projected = self.multi_modal_projector(x[1])
-
-        return (x[0], x[1], out, projected)
+        return (x[0], x[1], out)
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@ -9,7 +9,6 @@ import comfy.model_patcher
 import comfy.model_management
 import comfy.utils
 import comfy.clip_model
-import comfy.image_encoders.dino2

 class Output:
    def __getitem__(self, key):
@ -17,43 +16,29 @@ class Output:
    def __setitem__(self, key, item):
        setattr(self, key, item)

-def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True):
-    mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
-    std = torch.tensor(std, device=image.device, dtype=image.dtype)
+def clip_preprocess(image, size=224):
+    mean = torch.tensor([ 0.48145466,0.4578275,0.40821073], device=image.device, dtype=image.dtype)
+    std = torch.tensor([0.26862954,0.26130258,0.27577711], device=image.device, dtype=image.dtype)
    image = image.movedim(-1, 1)
    if not (image.shape[2] == size and image.shape[3] == size):
-        if crop:
        scale = (size / min(image.shape[2], image.shape[3]))
-            scale_size = (round(scale * image.shape[2]), round(scale * image.shape[3]))
-        else:
-            scale_size = (size, size)
-
-        image = torch.nn.functional.interpolate(image, size=scale_size, mode="bicubic", antialias=True)
+        image = torch.nn.functional.interpolate(image, size=(round(scale * image.shape[2]), round(scale * image.shape[3])), mode="bicubic", antialias=True)
        h = (image.shape[2] - size)//2
        w = (image.shape[3] - size)//2
        image = image[:,:,h:h+size,w:w+size]
    image = torch.clip((255. * image), 0, 255).round() / 255.0
    return (image - mean.view([3,1,1])) / std.view([3,1,1])

-IMAGE_ENCODERS = {
-    "clip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
-    "siglip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
-    "dinov2": comfy.image_encoders.dino2.Dinov2Model,
-}
-
 class ClipVisionModel():
    def __init__(self, json_config):
        with open(json_config) as f:
            config = json.load(f)

        self.image_size = config.get("image_size", 224)
-        self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073])
-        self.image_std = config.get("image_std", [0.26862954, 0.26130258, 0.27577711])
-        model_class = IMAGE_ENCODERS.get(config.get("model_type", "clip_vision_model"))
        self.load_device = comfy.model_management.text_encoder_device()
        offload_device = comfy.model_management.text_encoder_offload_device()
        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
-        self.model = model_class(config, self.dtype, offload_device, comfy.ops.manual_cast)
+        self.model = comfy.clip_model.CLIPVisionModelProjection(config, self.dtype, offload_device, comfy.ops.manual_cast)
        self.model.eval()

        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
@ -64,16 +49,15 @@ class ClipVisionModel():
    def get_sd(self):
        return self.model.state_dict()

-    def encode_image(self, image, crop=True):
+    def encode_image(self, image):
        comfy.model_management.load_model_gpu(self.patcher)
-        pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float()
+        pixel_values = clip_preprocess(image.to(self.load_device), size=self.image_size).float()
        out = self.model(pixel_values=pixel_values, intermediate_output=-2)

        outputs = Output()
        outputs["last_hidden_state"] = out[0].to(comfy.model_management.intermediate_device())
        outputs["image_embeds"] = out[2].to(comfy.model_management.intermediate_device())
        outputs["penultimate_hidden_states"] = out[1].to(comfy.model_management.intermediate_device())
-        outputs["mm_projected"] = out[3]
        return outputs

 def convert_to_transformers(sd, prefix):
@ -110,21 +94,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
    elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
        json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
-        embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
-        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
-            if embed_shape == 729:
-                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
-            elif embed_shape == 1024:
-                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_512.json")
-        elif embed_shape == 577:
-            if "multi_modal_projector.linear_1.bias" in sd:
-                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
-            else:
+        if sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577:
            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
        else:
            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
-    elif "embeddings.patch_embeddings.projection.weight" in sd:
-        json_config = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "image_encoders"), "dino2_giant.json")
    else:
        return None

@ -136,7 +109,8 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
    keys = list(sd.keys())
    for k in keys:
        if k not in u:
-            sd.pop(k)
+            t = sd.pop(k)
+            del t
    return clip

 def load(ckpt_path):
--- a/comfy/clip_vision_config_vitl_336_llava.json
+++ b/comfy/clip_vision_config_vitl_336_llava.json
@ -1,19 +0,0 @@
-{
-  "attention_dropout": 0.0,
-  "dropout": 0.0,
-  "hidden_act": "quick_gelu",
-  "hidden_size": 1024,
-  "image_size": 336,
-  "initializer_factor": 1.0,
-  "initializer_range": 0.02,
-  "intermediate_size": 4096,
-  "layer_norm_eps": 1e-5,
-  "model_type": "clip_vision_model",
-  "num_attention_heads": 16,
-  "num_channels": 3,
-  "num_hidden_layers": 24,
-  "patch_size": 14,
-  "projection_dim": 768,
-  "projector_type": "llava3",
-  "torch_dtype": "float32"
-}
--- a/comfy/clip_vision_siglip_384.json
+++ b/comfy/clip_vision_siglip_384.json
@ -1,13 +0,0 @@
-{
-  "num_channels": 3,
-  "hidden_act": "gelu_pytorch_tanh",
-  "hidden_size": 1152,
-  "image_size": 384,
-  "intermediate_size": 4304,
-  "model_type": "siglip_vision_model",
-  "num_attention_heads": 16,
-  "num_hidden_layers": 27,
-  "patch_size": 14,
-  "image_mean": [0.5, 0.5, 0.5],
-  "image_std": [0.5, 0.5, 0.5]
-}
--- a/comfy/clip_vision_siglip_512.json
+++ b/comfy/clip_vision_siglip_512.json
@ -1,13 +0,0 @@
-{
-  "num_channels": 3,
-  "hidden_act": "gelu_pytorch_tanh",
-  "hidden_size": 1152,
-  "image_size": 512,
-  "intermediate_size": 4304,
-  "model_type": "siglip_vision_model",
-  "num_attention_heads": 16,
-  "num_hidden_layers": 27,
-  "patch_size": 16,
-  "image_mean": [0.5, 0.5, 0.5],
-  "image_std": [0.5, 0.5, 0.5]
-}
--- a/comfy/comfy_types/README.md
+++ b/comfy/comfy_types/README.md
@ -1,43 +0,0 @@
-# Comfy Typing
-## Type hinting for ComfyUI Node development
-
-This module provides type hinting and concrete convenience types for node developers.
-If cloned to the custom_nodes directory of ComfyUI, types can be imported using:
-
-```python
-from comfy.comfy_types import IO, ComfyNodeABC, CheckLazyMixin
-
-class ExampleNode(ComfyNodeABC):
-    @classmethod
-    def INPUT_TYPES(s) -> InputTypeDict:
-        return {"required": {}}
-```
-
-Full example is in [examples/example_nodes.py](examples/example_nodes.py).
-
-# Types
-A few primary types are documented below.  More complete information is available via the docstrings on each type.
-
-## `IO`
-
-A string enum of built-in and a few custom data types.  Includes the following special types and their requisite plumbing:
-
- `ANY`: `"*"`
- `NUMBER`: `"FLOAT,INT"`
- `PRIMITIVE`: `"STRING,FLOAT,INT,BOOLEAN"`
-
-## `ComfyNodeABC`
-
-An abstract base class for nodes, offering type-hinting / autocomplete, and somewhat-alright docstrings.
-
-### Type hinting for `INPUT_TYPES`
-
-![INPUT_TYPES auto-completion in Visual Studio Code](examples/input_types.png)
-
-### `INPUT_TYPES` return dict
-
-![INPUT_TYPES return value type hinting in Visual Studio Code](examples/required_hint.png)
-
-### Options for individual inputs
-
-![INPUT_TYPES return value option auto-completion in Visual Studio Code](examples/input_options.png)
--- a/comfy/comfy_types/examples/example_nodes.py
+++ b/comfy/comfy_types/examples/example_nodes.py
@ -1,28 +0,0 @@
-from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict
-from inspect import cleandoc
-
-
-class ExampleNode(ComfyNodeABC):
-    """An example node that just adds 1 to an input integer.
-
-    * Requires a modern IDE to provide any benefit (detail: an IDE configured with analysis paths etc).
-    * This node is intended as an example for developers only.
-    """
-
-    DESCRIPTION = cleandoc(__doc__)
-    CATEGORY = "examples"
-
-    @classmethod
-    def INPUT_TYPES(s) -> InputTypeDict:
-        return {
-            "required": {
-                "input_int": (IO.INT, {"defaultInput": True}),
-            }
-        }
-
-    RETURN_TYPES = (IO.INT,)
-    RETURN_NAMES = ("input_plus_one",)
-    FUNCTION = "execute"
-
-    def execute(self, input_int: int):
-        return (input_int + 1,)
--- a/comfy/comfy_types/examples/input_options.png
+++ b/comfy/comfy_types/examples/input_options.png
--- a/comfy/comfy_types/examples/input_types.png
+++ b/comfy/comfy_types/examples/input_types.png
--- a/comfy/comfy_types/examples/required_hint.png
+++ b/comfy/comfy_types/examples/required_hint.png
--- a/comfy/comfy_types/node_typing.py
+++ b/comfy/comfy_types/node_typing.py
@ -1,336 +0,0 @@
-"""Comfy-specific type hinting"""
-
-from __future__ import annotations
-from typing import Literal, TypedDict
-from typing_extensions import NotRequired
-from abc import ABC, abstractmethod
-from enum import Enum
-
-
-class StrEnum(str, Enum):
-    """Base class for string enums. Python's StrEnum is not available until 3.11."""
-
-    def __str__(self) -> str:
-        return self.value
-
-
-class IO(StrEnum):
-    """Node input/output data types.
-
-    Includes functionality for ``"*"`` (`ANY`) and ``"MULTI,TYPES"``.
-    """
-
-    STRING = "STRING"
-    IMAGE = "IMAGE"
-    MASK = "MASK"
-    LATENT = "LATENT"
-    BOOLEAN = "BOOLEAN"
-    INT = "INT"
-    FLOAT = "FLOAT"
-    COMBO = "COMBO"
-    CONDITIONING = "CONDITIONING"
-    SAMPLER = "SAMPLER"
-    SIGMAS = "SIGMAS"
-    GUIDER = "GUIDER"
-    NOISE = "NOISE"
-    CLIP = "CLIP"
-    CONTROL_NET = "CONTROL_NET"
-    VAE = "VAE"
-    MODEL = "MODEL"
-    CLIP_VISION = "CLIP_VISION"
-    CLIP_VISION_OUTPUT = "CLIP_VISION_OUTPUT"
-    STYLE_MODEL = "STYLE_MODEL"
-    GLIGEN = "GLIGEN"
-    UPSCALE_MODEL = "UPSCALE_MODEL"
-    AUDIO = "AUDIO"
-    WEBCAM = "WEBCAM"
-    POINT = "POINT"
-    FACE_ANALYSIS = "FACE_ANALYSIS"
-    BBOX = "BBOX"
-    SEGS = "SEGS"
-
-    ANY = "*"
-    """Always matches any type, but at a price.
-
-    Causes some functionality issues (e.g. reroutes, link types), and should be avoided whenever possible.
-    """
-    NUMBER = "FLOAT,INT"
-    """A float or an int - could be either"""
-    PRIMITIVE = "STRING,FLOAT,INT,BOOLEAN"
-    """Could be any of: string, float, int, or bool"""
-
-    def __ne__(self, value: object) -> bool:
-        if self == "*" or value == "*":
-            return False
-        if not isinstance(value, str):
-            return True
-        a = frozenset(self.split(","))
-        b = frozenset(value.split(","))
-        return not (b.issubset(a) or a.issubset(b))
-
-
-class RemoteInputOptions(TypedDict):
-    route: str
-    """The route to the remote source."""
-    refresh_button: bool
-    """Specifies whether to show a refresh button in the UI below the widget."""
-    control_after_refresh: Literal["first", "last"]
-    """Specifies the control after the refresh button is clicked. If "first", the first item will be automatically selected, and so on."""
-    timeout: int
-    """The maximum amount of time to wait for a response from the remote source in milliseconds."""
-    max_retries: int
-    """The maximum number of retries before aborting the request."""
-    refresh: int
-    """The TTL of the remote input's value in milliseconds. Specifies the interval at which the remote input's value is refreshed."""
-
-
-class MultiSelectOptions(TypedDict):
-    placeholder: NotRequired[str]
-    """The placeholder text to display in the multi-select widget when no items are selected."""
-    chip: NotRequired[bool]
-    """Specifies whether to use chips instead of comma separated values for the multi-select widget."""
-
-
-class InputTypeOptions(TypedDict):
-    """Provides type hinting for the return type of the INPUT_TYPES node function.
-
-    Due to IDE limitations with unions, for now all options are available for all types (e.g. `label_on` is hinted even when the type is not `IO.BOOLEAN`).
-
-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/datatypes
-    """
-
-    default: NotRequired[bool | str | float | int | list | tuple]
-    """The default value of the widget"""
-    defaultInput: NotRequired[bool]
-    """@deprecated in v1.16 frontend. v1.16 frontend allows input socket and widget to co-exist.
-    - defaultInput on required inputs should be dropped.
-    - defaultInput on optional inputs should be replaced with forceInput.
-    Ref: https://github.com/Comfy-Org/ComfyUI_frontend/pull/3364
-    """
-    forceInput: NotRequired[bool]
-    """Forces the input to be an input slot rather than a widget even a widget is available for the input type."""
-    lazy: NotRequired[bool]
-    """Declares that this input uses lazy evaluation"""
-    rawLink: NotRequired[bool]
-    """When a link exists, rather than receiving the evaluated value, you will receive the link (i.e. `["nodeId", <outputIndex>]`). Designed for node expansion."""
-    tooltip: NotRequired[str]
-    """Tooltip for the input (or widget), shown on pointer hover"""
-    # class InputTypeNumber(InputTypeOptions):
-    # default: float | int
-    min: NotRequired[float]
-    """The minimum value of a number (``FLOAT`` | ``INT``)"""
-    max: NotRequired[float]
-    """The maximum value of a number (``FLOAT`` | ``INT``)"""
-    step: NotRequired[float]
-    """The amount to increment or decrement a widget by when stepping up/down (``FLOAT`` | ``INT``)"""
-    round: NotRequired[float]
-    """Floats are rounded by this value (``FLOAT``)"""
-    # class InputTypeBoolean(InputTypeOptions):
-    # default: bool
-    label_on: NotRequired[str]
-    """The label to use in the UI when the bool is True (``BOOLEAN``)"""
-    label_off: NotRequired[str]
-    """The label to use in the UI when the bool is False (``BOOLEAN``)"""
-    # class InputTypeString(InputTypeOptions):
-    # default: str
-    multiline: NotRequired[bool]
-    """Use a multiline text box (``STRING``)"""
-    placeholder: NotRequired[str]
-    """Placeholder text to display in the UI when empty (``STRING``)"""
-    # Deprecated:
-    # defaultVal: str
-    dynamicPrompts: NotRequired[bool]
-    """Causes the front-end to evaluate dynamic prompts (``STRING``)"""
-    # class InputTypeCombo(InputTypeOptions):
-    image_upload: NotRequired[bool]
-    """Specifies whether the input should have an image upload button and image preview attached to it. Requires that the input's name is `image`."""
-    image_folder: NotRequired[Literal["input", "output", "temp"]]
-    """Specifies which folder to get preview images from if the input has the ``image_upload`` flag.
-    """
-    remote: NotRequired[RemoteInputOptions]
-    """Specifies the configuration for a remote input.
-    Available after ComfyUI frontend v1.9.7
-    https://github.com/Comfy-Org/ComfyUI_frontend/pull/2422"""
-    control_after_generate: NotRequired[bool]
-    """Specifies whether a control widget should be added to the input, adding options to automatically change the value after each prompt is queued. Currently only used for INT and COMBO types."""
-    options: NotRequired[list[str | int | float]]
-    """COMBO type only. Specifies the selectable options for the combo widget.
-    Prefer:
-    ["COMBO", {"options": ["Option 1", "Option 2", "Option 3"]}]
-    Over:
-    [["Option 1", "Option 2", "Option 3"]]
-    """
-    multi_select: NotRequired[MultiSelectOptions]
-    """COMBO type only. Specifies the configuration for a multi-select widget.
-    Available after ComfyUI frontend v1.13.4
-    https://github.com/Comfy-Org/ComfyUI_frontend/pull/2987"""
-
-
-class HiddenInputTypeDict(TypedDict):
-    """Provides type hinting for the hidden entry of node INPUT_TYPES."""
-
-    node_id: NotRequired[Literal["UNIQUE_ID"]]
-    """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
-    unique_id: NotRequired[Literal["UNIQUE_ID"]]
-    """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
-    prompt: NotRequired[Literal["PROMPT"]]
-    """PROMPT is the complete prompt sent by the client to the server. See the prompt object for a full description."""
-    extra_pnginfo: NotRequired[Literal["EXTRA_PNGINFO"]]
-    """EXTRA_PNGINFO is a dictionary that will be copied into the metadata of any .png files saved. Custom nodes can store additional information in this dictionary for saving (or as a way to communicate with a downstream node)."""
-    dynprompt: NotRequired[Literal["DYNPROMPT"]]
-    """DYNPROMPT is an instance of comfy_execution.graph.DynamicPrompt. It differs from PROMPT in that it may mutate during the course of execution in response to Node Expansion."""
-
-
-class InputTypeDict(TypedDict):
-    """Provides type hinting for node INPUT_TYPES.
-
-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs
-    """
-
-    required: NotRequired[dict[str, tuple[IO, InputTypeOptions]]]
-    """Describes all inputs that must be connected for the node to execute."""
-    optional: NotRequired[dict[str, tuple[IO, InputTypeOptions]]]
-    """Describes inputs which do not need to be connected."""
-    hidden: NotRequired[HiddenInputTypeDict]
-    """Offers advanced functionality and server-client communication.
-
-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs#hidden-inputs
-    """
-
-
-class ComfyNodeABC(ABC):
-    """Abstract base class for Comfy nodes.  Includes the names and expected types of attributes.
-
-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview
-    """
-
-    DESCRIPTION: str
-    """Node description, shown as a tooltip when hovering over the node.
-
-    Usage::
-
-        # Explicitly define the description
-        DESCRIPTION = "Example description here."
-
-        # Use the docstring of the node class.
-        DESCRIPTION = cleandoc(__doc__)
-    """
-    CATEGORY: str
-    """The category of the node, as per the "Add Node" menu.
-
-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#category
-    """
-    EXPERIMENTAL: bool
-    """Flags a node as experimental, informing users that it may change or not work as expected."""
-    DEPRECATED: bool
-    """Flags a node as deprecated, indicating to users that they should find alternatives to this node."""
-
-    @classmethod
-    @abstractmethod
-    def INPUT_TYPES(s) -> InputTypeDict:
-        """Defines node inputs.
-
-        * Must include the ``required`` key, which describes all inputs that must be connected for the node to execute.
-        * The ``optional`` key can be added to describe inputs which do not need to be connected.
-        * The ``hidden`` key offers some advanced functionality.  More info at: https://docs.comfy.org/custom-nodes/backend/more_on_inputs#hidden-inputs
-
-        Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#input-types
-        """
-        return {"required": {}}
-
-    OUTPUT_NODE: bool
-    """Flags this node as an output node, causing any inputs it requires to be executed.
-
-    If a node is not connected to any output nodes, that node will not be executed.  Usage::
-
-        OUTPUT_NODE = True
-
-    From the docs:
-
-    By default, a node is not considered an output. Set ``OUTPUT_NODE = True`` to specify that it is.
-
-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#output-node
-    """
-    INPUT_IS_LIST: bool
-    """A flag indicating if this node implements the additional code necessary to deal with OUTPUT_IS_LIST nodes.
-
-    All inputs of ``type`` will become ``list[type]``, regardless of how many items are passed in.  This also affects ``check_lazy_status``.
-
-    From the docs:
-
-    A node can also override the default input behaviour and receive the whole list in a single call. This is done by setting a class attribute `INPUT_IS_LIST` to ``True``.
-
-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing
-    """
-    OUTPUT_IS_LIST: tuple[bool]
-    """A tuple indicating which node outputs are lists, but will be connected to nodes that expect individual items.
-
-    Connected nodes that do not implement `INPUT_IS_LIST` will be executed once for every item in the list.
-
-    A ``tuple[bool]``, where the items match those in `RETURN_TYPES`::
-
-        RETURN_TYPES = (IO.INT, IO.INT, IO.STRING)
-        OUTPUT_IS_LIST = (True, True, False) # The string output will be handled normally
-
-    From the docs:
-
-    In order to tell Comfy that the list being returned should not be wrapped, but treated as a series of data for sequential processing,
-    the node should provide a class attribute `OUTPUT_IS_LIST`, which is a ``tuple[bool]``, of the same length as `RETURN_TYPES`,
-    specifying which outputs which should be so treated.
-
-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing
-    """
-
-    RETURN_TYPES: tuple[IO]
-    """A tuple representing the outputs of this node.
-
-    Usage::
-
-        RETURN_TYPES = (IO.INT, "INT", "CUSTOM_TYPE")
-
-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-types
-    """
-    RETURN_NAMES: tuple[str]
-    """The output slot names for each item in `RETURN_TYPES`, e.g. ``RETURN_NAMES = ("count", "filter_string")``
-
-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-names
-    """
-    OUTPUT_TOOLTIPS: tuple[str]
-    """A tuple of strings to use as tooltips for node outputs, one for each item in `RETURN_TYPES`."""
-    FUNCTION: str
-    """The name of the function to execute as a literal string, e.g. `FUNCTION = "execute"`
-
-    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#function
-    """
-
-
-class CheckLazyMixin:
-    """Provides a basic check_lazy_status implementation and type hinting for nodes that use lazy inputs."""
-
-    def check_lazy_status(self, **kwargs) -> list[str]:
-        """Returns a list of input names that should be evaluated.
-
-        This basic mixin impl. requires all inputs.
-
-        :kwargs: All node inputs will be included here.  If the input is ``None``, it should be assumed that it has not yet been evaluated.  \
-            When using ``INPUT_IS_LIST = True``, unevaluated will instead be ``(None,)``.
-
-        Params should match the nodes execution ``FUNCTION`` (self, and all inputs by name).
-        Will be executed repeatedly until it returns an empty list, or all requested items were already evaluated (and sent as params).
-
-        Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lazy_evaluation#defining-check-lazy-status
-        """
-
-        need = [name for name in kwargs if kwargs[name] is None]
-        return need
-
-
-class FileLocator(TypedDict):
-    """Provides type hinting for the file location"""
-
-    filename: str
-    """The filename of the file."""
-    subfolder: str
-    """The subfolder of the file."""
-    type: Literal["input", "output", "temp"]
-    """The root folder of the file."""
--- a/comfy/conds.py
+++ b/comfy/conds.py
@ -3,6 +3,9 @@ import math
 import comfy.utils


+def lcm(a, b): #TODO: eventually replace by math.lcm (added in python3.9)
+    return abs(a*b) // math.gcd(a, b) # NOTE(review): lcm(0, 0) raises ZeroDivisionError here (math.gcd(0, 0) == 0), whereas math.lcm(0, 0) returns 0
+
 class CONDRegular:
    def __init__(self, cond):
        self.cond = cond
@ -43,7 +46,7 @@ class CONDCrossAttn(CONDRegular):
            if s1[0] != s2[0] or s1[2] != s2[2]: #these 2 cases should not happen
                return False

-            mult_min = math.lcm(s1[1], s2[1])
+            mult_min = lcm(s1[1], s2[1])
            diff = mult_min // min(s1[1], s2[1])
            if diff > 4: #arbitrary limit on the padding because it's probably going to impact performance negatively if it's too much
                return False
@ -54,7 +57,7 @@ class CONDCrossAttn(CONDRegular):
        crossattn_max_len = self.cond.shape[1]
        for x in others:
            c = x.cond
-            crossattn_max_len = math.lcm(crossattn_max_len, c.shape[1])
+            crossattn_max_len = lcm(crossattn_max_len, c.shape[1])
            conds.append(c)

        out = []
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@ -34,12 +34,6 @@ import comfy.t2i_adapter.adapter
 import comfy.ldm.cascade.controlnet
 import comfy.cldm.mmdit
 import comfy.ldm.hydit.controlnet
-import comfy.ldm.flux.controlnet
-import comfy.cldm.dit_embedder
-from typing import TYPE_CHECKING
-if TYPE_CHECKING:
-    from comfy.hooks import HookGroup
-

 def broadcast_image_to(tensor, target_batch_size, batched_number):
    current_batch_size = tensor.shape[0]
@ -64,7 +58,7 @@ class StrengthType(Enum):
    LINEAR_UP = 2

 class ControlBase:
-    def __init__(self):
+    def __init__(self, device=None):
        self.cond_hint_original = None
        self.cond_hint = None
        self.strength = 1.0
@ -76,26 +70,20 @@ class ControlBase:
        self.compression_ratio = 8
        self.upscale_algorithm = 'nearest-exact'
        self.extra_args = {}
+
+        if device is None:
+            device = comfy.model_management.get_torch_device()
+        self.device = device
        self.previous_controlnet = None
        self.extra_conds = []
        self.strength_type = StrengthType.CONSTANT
-        self.concat_mask = False
-        self.extra_concat_orig = []
-        self.extra_concat = None
-        self.extra_hooks: HookGroup = None
-        self.preprocess_image = lambda a: a

-    def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None, extra_concat=[]):
+    def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None):
        self.cond_hint_original = cond_hint
        self.strength = strength
        self.timestep_percent_range = timestep_percent_range
        if self.latent_format is not None:
-            if vae is None:
-                logging.warning("WARNING: no VAE provided to the controlnet apply node when this controlnet requires one.")
            self.vae = vae
-        self.extra_concat_orig = extra_concat.copy()
-        if self.concat_mask and len(self.extra_concat_orig) == 0:
-            self.extra_concat_orig.append(torch.tensor([[[[1.0]]]]))
        return self

    def pre_run(self, model, percent_to_timestep_function):
@ -110,9 +98,9 @@ class ControlBase:
    def cleanup(self):
        if self.previous_controlnet is not None:
            self.previous_controlnet.cleanup()
-
+        if self.cond_hint is not None:
+            del self.cond_hint
            self.cond_hint = None
-        self.extra_concat = None
        self.timestep_range = None

    def get_models(self):
@ -121,14 +109,6 @@ class ControlBase:
            out += self.previous_controlnet.get_models()
        return out

-    def get_extra_hooks(self):
-        out = []
-        if self.extra_hooks is not None:
-            out.append(self.extra_hooks)
-        if self.previous_controlnet is not None:
-            out += self.previous_controlnet.get_extra_hooks()
-        return out
-
    def copy_to(self, c):
        c.cond_hint_original = self.cond_hint_original
        c.strength = self.strength
@ -141,10 +121,6 @@ class ControlBase:
        c.vae = self.vae
        c.extra_conds = self.extra_conds.copy()
        c.strength_type = self.strength_type
-        c.concat_mask = self.concat_mask
-        c.extra_concat_orig = self.extra_concat_orig.copy()
-        c.extra_hooks = self.extra_hooks.clone() if self.extra_hooks else None
-        c.preprocess_image = self.preprocess_image

    def inference_memory_requirements(self, dtype):
        if self.previous_controlnet is not None:
@ -170,7 +146,7 @@ class ControlBase:
                        elif self.strength_type == StrengthType.LINEAR_UP:
                            x *= (self.strength ** float(len(control_output) - i))

-                    if output_dtype is not None and x.dtype != output_dtype:
+                    if x.dtype != output_dtype:
                        x = x.to(output_dtype)

                out[key].append(x)
@ -197,8 +173,8 @@ class ControlBase:


 class ControlNet(ControlBase):
-    def __init__(self, control_model=None, global_average_pooling=False, compression_ratio=8, latent_format=None, load_device=None, manual_cast_dtype=None, extra_conds=["y"], strength_type=StrengthType.CONSTANT, concat_mask=False, preprocess_image=lambda a: a):
-        super().__init__()
+    def __init__(self, control_model=None, global_average_pooling=False, compression_ratio=8, latent_format=None, device=None, load_device=None, manual_cast_dtype=None, extra_conds=["y"], strength_type=StrengthType.CONSTANT):
+        super().__init__(device)
        self.control_model = control_model
        self.load_device = load_device
        if control_model is not None:
@ -211,13 +187,11 @@ class ControlNet(ControlBase):
        self.latent_format = latent_format
        self.extra_conds += extra_conds
        self.strength_type = strength_type
-        self.concat_mask = concat_mask
-        self.preprocess_image = preprocess_image

-    def get_control(self, x_noisy, t, cond, batched_number, transformer_options):
+    def get_control(self, x_noisy, t, cond, batched_number):
        control_prev = None
        if self.previous_controlnet is not None:
-            control_prev = self.previous_controlnet.get_control(x_noisy, t, cond, batched_number, transformer_options)
+            control_prev = self.previous_controlnet.get_control(x_noisy, t, cond, batched_number)

        if self.timestep_range is not None:
            if t[0] > self.timestep_range[0] or t[0] < self.timestep_range[1]:
@ -230,6 +204,7 @@ class ControlNet(ControlBase):
        if self.manual_cast_dtype is not None:
            dtype = self.manual_cast_dtype

+        output_dtype = x_noisy.dtype
        if self.cond_hint is None or x_noisy.shape[2] * self.compression_ratio != self.cond_hint.shape[2] or x_noisy.shape[3] * self.compression_ratio != self.cond_hint.shape[3]:
            if self.cond_hint is not None:
                del self.cond_hint
@ -237,26 +212,14 @@ class ControlNet(ControlBase):
            compression_ratio = self.compression_ratio
            if self.vae is not None:
                compression_ratio *= self.vae.downscale_ratio
-            else:
-                if self.latent_format is not None:
-                    raise ValueError("This Controlnet needs a VAE but none was provided, please use a ControlNetApply node with a VAE input and connect it.")
            self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * compression_ratio, x_noisy.shape[2] * compression_ratio, self.upscale_algorithm, "center")
-            self.cond_hint = self.preprocess_image(self.cond_hint)
            if self.vae is not None:
                loaded_models = comfy.model_management.loaded_models(only_currently_used=True)
                self.cond_hint = self.vae.encode(self.cond_hint.movedim(1, -1))
                comfy.model_management.load_models_gpu(loaded_models)
            if self.latent_format is not None:
                self.cond_hint = self.latent_format.process_in(self.cond_hint)
-            if len(self.extra_concat_orig) > 0:
-                to_concat = []
-                for c in self.extra_concat_orig:
-                    c = c.to(self.cond_hint.device)
-                    c = comfy.utils.common_upscale(c, self.cond_hint.shape[3], self.cond_hint.shape[2], self.upscale_algorithm, "center")
-                    to_concat.append(comfy.utils.repeat_to_batch_size(c, self.cond_hint.shape[0]))
-                self.cond_hint = torch.cat([self.cond_hint] + to_concat, dim=1)
-
-            self.cond_hint = self.cond_hint.to(device=x_noisy.device, dtype=dtype)
+            self.cond_hint = self.cond_hint.to(device=self.device, dtype=dtype)
        if x_noisy.shape[0] != self.cond_hint.shape[0]:
            self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)

@ -271,7 +234,7 @@ class ControlNet(ControlBase):
        x_noisy = self.model_sampling_current.calculate_input(t, x_noisy)

        control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.to(dtype), context=context.to(dtype), **extra)
-        return self.control_merge(control, control_prev, output_dtype=None)
+        return self.control_merge(control, control_prev, output_dtype)

    def copy(self):
        c = ControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
@ -297,6 +260,7 @@ class ControlLoraOps:
    class Linear(torch.nn.Module, comfy.ops.CastWeightBiasOp):
        def __init__(self, in_features: int, out_features: int, bias: bool = True,
                    device=None, dtype=None) -> None:
+            factory_kwargs = {'device': device, 'dtype': dtype}
            super().__init__()
            self.in_features = in_features
            self.out_features = out_features
@ -354,8 +318,8 @@ class ControlLoraOps:


 class ControlLora(ControlNet):
-    def __init__(self, control_weights, global_average_pooling=False, model_options={}): #TODO? model_options
-        ControlBase.__init__(self)
+    def __init__(self, control_weights, global_average_pooling=False, device=None):
+        ControlBase.__init__(self, device)
        self.control_weights = control_weights
        self.global_average_pooling = global_average_pooling
        self.extra_conds += ["y"]
@ -381,6 +345,7 @@ class ControlLora(ControlNet):
        self.control_model.to(comfy.model_management.get_torch_device())
        diffusion_model = model.diffusion_model
        sd = diffusion_model.state_dict()
+        cm = self.control_model.state_dict()

        for k in sd:
            weight = sd[k]
@ -410,25 +375,21 @@ class ControlLora(ControlNet):
    def inference_memory_requirements(self, dtype):
        return comfy.utils.calculate_parameters(self.control_weights) * comfy.model_management.dtype_size(dtype) + ControlBase.inference_memory_requirements(self, dtype)

-def controlnet_config(sd, model_options={}):
+def controlnet_config(sd): # returns (model_config, operations, load_device, unet_dtype, manual_cast_dtype) derived from the controlnet state dict
    model_config = comfy.model_detection.model_config_from_unet(sd, "", True)

-    unet_dtype = model_options.get("dtype", None)
-    if unet_dtype is None:
-        weight_dtype = comfy.utils.weight_dtype(sd)
-
-        supported_inference_dtypes = list(model_config.supported_inference_dtypes)
-        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes, weight_dtype=weight_dtype)
+    supported_inference_dtypes = model_config.supported_inference_dtypes

+    controlnet_config = model_config.unet_config # NOTE(review): never read in this function; the local also shadows the function's own name
+    unet_dtype = comfy.model_management.unet_dtype(supported_dtypes=supported_inference_dtypes)
    load_device = comfy.model_management.get_torch_device()
    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
+    if manual_cast_dtype is not None:
+        operations = comfy.ops.manual_cast
+    else:
+        operations = comfy.ops.disable_weight_init

-    operations = model_options.get("custom_operations", None)
-    if operations is None:
-        operations = comfy.ops.pick_operations(unet_dtype, manual_cast_dtype, disable_fast_fp8=True)
-
-    offload_device = comfy.model_management.unet_offload_device()
-    return model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device
+    return model_config, operations, load_device, unet_dtype, manual_cast_dtype

 def controlnet_load_state_dict(control_model, sd):
    missing, unexpected = control_model.load_state_dict(sd, strict=False)
@@ -440,108 +401,25 @@ def controlnet_load_state_dict(control_model, sd):
        logging.debug("unexpected controlnet keys: {}".format(unexpected))
    return control_model

-
-def load_controlnet_mmdit(sd, model_options={}):
+def load_controlnet_mmdit(sd):
    new_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "")
-    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(new_sd, model_options=model_options)
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype = controlnet_config(new_sd)
    num_blocks = comfy.model_detection.count_blocks(new_sd, 'joint_blocks.{}.')
    for k in sd:
        new_sd[k] = sd[k]

-    concat_mask = False
-    control_latent_channels = new_sd.get("pos_embed_input.proj.weight").shape[1]
-    if control_latent_channels == 17: #inpaint controlnet
-        concat_mask = True
-
-    control_model = comfy.cldm.mmdit.ControlNet(num_blocks=num_blocks, control_latent_channels=control_latent_channels, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
+    control_model = comfy.cldm.mmdit.ControlNet(num_blocks=num_blocks, operations=operations, device=load_device, dtype=unet_dtype, **model_config.unet_config)
    control_model = controlnet_load_state_dict(control_model, new_sd)

    latent_format = comfy.latent_formats.SD3()
    latent_format.shift_factor = 0 #SD3 controlnet weirdness
-    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype)
+    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype)
    return control

+def load_controlnet_hunyuandit(controlnet_data):
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype = controlnet_config(controlnet_data)

-class ControlNetSD35(ControlNet):
-    def pre_run(self, model, percent_to_timestep_function):
-        if self.control_model.double_y_emb:
-            missing, unexpected = self.control_model.orig_y_embedder.load_state_dict(model.diffusion_model.y_embedder.state_dict(), strict=False)
-        else:
-            missing, unexpected = self.control_model.x_embedder.load_state_dict(model.diffusion_model.x_embedder.state_dict(), strict=False)
-        super().pre_run(model, percent_to_timestep_function)
-
-    def copy(self):
-        c = ControlNetSD35(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
-        c.control_model = self.control_model
-        c.control_model_wrapped = self.control_model_wrapped
-        self.copy_to(c)
-        return c
-
-def load_controlnet_sd35(sd, model_options={}):
-    control_type = -1
-    if "control_type" in sd:
-        control_type = round(sd.pop("control_type").item())
-
-    # blur_cnet = control_type == 0
-    canny_cnet = control_type == 1
-    depth_cnet = control_type == 2
-
-    new_sd = {}
-    for k in comfy.utils.MMDIT_MAP_BASIC:
-        if k[1] in sd:
-            new_sd[k[0]] = sd.pop(k[1])
-    for k in sd:
-        new_sd[k] = sd[k]
-    sd = new_sd
-
-    y_emb_shape = sd["y_embedder.mlp.0.weight"].shape
-    depth = y_emb_shape[0] // 64
-    hidden_size = 64 * depth
-    num_heads = depth
-    head_dim = hidden_size // num_heads
-    num_blocks = comfy.model_detection.count_blocks(new_sd, 'transformer_blocks.{}.')
-
-    load_device = comfy.model_management.get_torch_device()
-    offload_device = comfy.model_management.unet_offload_device()
-    unet_dtype = comfy.model_management.unet_dtype(model_params=-1)
-
-    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
-
-    operations = model_options.get("custom_operations", None)
-    if operations is None:
-        operations = comfy.ops.pick_operations(unet_dtype, manual_cast_dtype, disable_fast_fp8=True)
-
-    control_model = comfy.cldm.dit_embedder.ControlNetEmbedder(img_size=None,
-                                                               patch_size=2,
-                                                               in_chans=16,
-                                                               num_layers=num_blocks,
-                                                               main_model_double=depth,
-                                                               double_y_emb=y_emb_shape[0] == y_emb_shape[1],
-                                                               attention_head_dim=head_dim,
-                                                               num_attention_heads=num_heads,
-                                                               adm_in_channels=2048,
-                                                               device=offload_device,
-                                                               dtype=unet_dtype,
-                                                               operations=operations)
-
-    control_model = controlnet_load_state_dict(control_model, sd)
-
-    latent_format = comfy.latent_formats.SD3()
-    preprocess_image = lambda a: a
-    if canny_cnet:
-        preprocess_image = lambda a: (a * 255 * 0.5 + 0.5)
-    elif depth_cnet:
-        preprocess_image = lambda a: 1.0 - a
-
-    control = ControlNetSD35(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, preprocess_image=preprocess_image)
-    return control
-
-
-
-def load_controlnet_hunyuandit(controlnet_data, model_options={}):
-    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(controlnet_data, model_options=model_options)
-
-    control_model = comfy.ldm.hydit.controlnet.HunYuanControlNet(operations=operations, device=offload_device, dtype=unet_dtype)
+    control_model = comfy.ldm.hydit.controlnet.HunYuanControlNet(operations=operations, device=load_device, dtype=unet_dtype)
    control_model = controlnet_load_state_dict(control_model, controlnet_data)

    latent_format = comfy.latent_formats.SDXL()
@@ -549,49 +427,13 @@ def load_controlnet_hunyuandit(controlnet_data, model_options={}):
    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds, strength_type=StrengthType.CONSTANT)
    return control

-def load_controlnet_flux_xlabs_mistoline(sd, mistoline=False, model_options={}):
-    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options)
-    control_model = comfy.ldm.flux.controlnet.ControlNetFlux(mistoline=mistoline, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
-    control_model = controlnet_load_state_dict(control_model, sd)
-    extra_conds = ['y', 'guidance']
-    control = ControlNet(control_model, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
-    return control
-
-def load_controlnet_flux_instantx(sd, model_options={}):
-    new_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "")
-    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(new_sd, model_options=model_options)
-    for k in sd:
-        new_sd[k] = sd[k]
-
-    num_union_modes = 0
-    union_cnet = "controlnet_mode_embedder.weight"
-    if union_cnet in new_sd:
-        num_union_modes = new_sd[union_cnet].shape[0]
-
-    control_latent_channels = new_sd.get("pos_embed_input.weight").shape[1] // 4
-    concat_mask = False
-    if control_latent_channels == 17:
-        concat_mask = True
-
-    control_model = comfy.ldm.flux.controlnet.ControlNetFlux(latent_input=True, num_union_modes=num_union_modes, control_latent_channels=control_latent_channels, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
-    control_model = controlnet_load_state_dict(control_model, new_sd)
-
-    latent_format = comfy.latent_formats.Flux()
-    extra_conds = ['y', 'guidance']
-    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
-    return control
-
-def convert_mistoline(sd):
-    return comfy.utils.state_dict_prefix_replace(sd, {"single_controlnet_blocks.": "controlnet_single_blocks."})
-
-
-def load_controlnet_state_dict(state_dict, model=None, model_options={}):
-    controlnet_data = state_dict
+def load_controlnet(ckpt_path, model=None):
+    controlnet_data = comfy.utils.load_torch_file(ckpt_path, safe_load=True)
    if 'after_proj_list.18.bias' in controlnet_data.keys(): #Hunyuan DiT
-        return load_controlnet_hunyuandit(controlnet_data, model_options=model_options)
+        return load_controlnet_hunyuandit(controlnet_data)

    if "lora_controlnet" in controlnet_data:
-        return ControlLora(controlnet_data, model_options=model_options)
+        return ControlLora(controlnet_data)

    controlnet_config = None
    supported_inference_dtypes = None
@@ -646,18 +488,8 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}):
        if len(leftover_keys) > 0:
            logging.warning("leftover keys: {}".format(leftover_keys))
        controlnet_data = new_sd
-    elif "controlnet_blocks.0.weight" in controlnet_data:
-        if "double_blocks.0.img_attn.norm.key_norm.scale" in controlnet_data:
-            return load_controlnet_flux_xlabs_mistoline(controlnet_data, model_options=model_options)
-        elif "pos_embed_input.proj.weight" in controlnet_data:
-            if "transformer_blocks.0.adaLN_modulation.1.bias" in controlnet_data:
-                return load_controlnet_sd35(controlnet_data, model_options=model_options) #Stability sd3.5 format
-            else:
-                return load_controlnet_mmdit(controlnet_data, model_options=model_options) #SD3 diffusers controlnet
-        elif "controlnet_x_embedder.weight" in controlnet_data:
-            return load_controlnet_flux_instantx(controlnet_data, model_options=model_options)
-    elif "controlnet_blocks.0.linear.weight" in controlnet_data: #mistoline flux
-        return load_controlnet_flux_xlabs_mistoline(convert_mistoline(controlnet_data), mistoline=True, model_options=model_options)
+    elif "controlnet_blocks.0.weight" in controlnet_data: #SD3 diffusers format
+        return load_controlnet_mmdit(controlnet_data)

    pth_key = 'control_model.zero_convs.0.0.weight'
    pth = False
@@ -669,35 +501,26 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}):
    elif key in controlnet_data:
        prefix = ""
    else:
-        net = load_t2i_adapter(controlnet_data, model_options=model_options)
+        net = load_t2i_adapter(controlnet_data)
        if net is None:
-            logging.error("error could not detect control model type.")
+            logging.error("error checkpoint does not contain controlnet or t2i adapter data {}".format(ckpt_path))
        return net

    if controlnet_config is None:
        model_config = comfy.model_detection.model_config_from_unet(controlnet_data, prefix, True)
-        supported_inference_dtypes = list(model_config.supported_inference_dtypes)
+        supported_inference_dtypes = model_config.supported_inference_dtypes
        controlnet_config = model_config.unet_config

-    unet_dtype = model_options.get("dtype", None)
-    if unet_dtype is None:
-        weight_dtype = comfy.utils.weight_dtype(controlnet_data)
-
-        if supported_inference_dtypes is None:
-            supported_inference_dtypes = [comfy.model_management.unet_dtype()]
-
-        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes, weight_dtype=weight_dtype)
-
    load_device = comfy.model_management.get_torch_device()
+    if supported_inference_dtypes is None:
+        unet_dtype = comfy.model_management.unet_dtype()
+    else:
+        unet_dtype = comfy.model_management.unet_dtype(supported_dtypes=supported_inference_dtypes)

    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
-    operations = model_options.get("custom_operations", None)
-    if operations is None:
-        operations = comfy.ops.pick_operations(unet_dtype, manual_cast_dtype)
-
-    controlnet_config["operations"] = operations
+    if manual_cast_dtype is not None:
+        controlnet_config["operations"] = comfy.ops.manual_cast
    controlnet_config["dtype"] = unet_dtype
-    controlnet_config["device"] = comfy.model_management.unet_offload_device()
    controlnet_config.pop("out_channels")
    controlnet_config["hint_channels"] = controlnet_data["{}input_hint_block.0.weight".format(prefix)].shape[1]
    control_model = comfy.cldm.cldm.ControlNet(**controlnet_config)
@@ -731,32 +554,22 @@ def load_controlnet_state_dict(state_dict, model=None, model_options={}):
    if len(unexpected) > 0:
        logging.debug("unexpected controlnet keys: {}".format(unexpected))

-    global_average_pooling = model_options.get("global_average_pooling", False)
+    global_average_pooling = False
+    filename = os.path.splitext(ckpt_path)[0]
+    if filename.endswith("_shuffle") or filename.endswith("_shuffle_fp16"): #TODO: smarter way of enabling global_average_pooling
+        global_average_pooling = True
+
    control = ControlNet(control_model, global_average_pooling=global_average_pooling, load_device=load_device, manual_cast_dtype=manual_cast_dtype)
    return control

-def load_controlnet(ckpt_path, model=None, model_options={}):
-    if "global_average_pooling" not in model_options:
-        filename = os.path.splitext(ckpt_path)[0]
-        if filename.endswith("_shuffle") or filename.endswith("_shuffle_fp16"): #TODO: smarter way of enabling global_average_pooling
-            model_options["global_average_pooling"] = True
-
-    cnet = load_controlnet_state_dict(comfy.utils.load_torch_file(ckpt_path, safe_load=True), model=model, model_options=model_options)
-    if cnet is None:
-        logging.error("error checkpoint does not contain controlnet or t2i adapter data {}".format(ckpt_path))
-    return cnet
-
 class T2IAdapter(ControlBase):
    def __init__(self, t2i_model, channels_in, compression_ratio, upscale_algorithm, device=None):
-        super().__init__()
+        super().__init__(device)
        self.t2i_model = t2i_model
        self.channels_in = channels_in
        self.control_input = None
        self.compression_ratio = compression_ratio
        self.upscale_algorithm = upscale_algorithm
-        if device is None:
-            device = comfy.model_management.get_torch_device()
-        self.device = device

    def scale_image_to(self, width, height):
        unshuffle_amount = self.t2i_model.unshuffle_amount
@@ -764,10 +577,10 @@ class T2IAdapter(ControlBase):
        height = math.ceil(height / unshuffle_amount) * unshuffle_amount
        return width, height

-    def get_control(self, x_noisy, t, cond, batched_number, transformer_options):
+    def get_control(self, x_noisy, t, cond, batched_number):
        control_prev = None
        if self.previous_controlnet is not None:
-            control_prev = self.previous_controlnet.get_control(x_noisy, t, cond, batched_number, transformer_options)
+            control_prev = self.previous_controlnet.get_control(x_noisy, t, cond, batched_number)

        if self.timestep_range is not None:
            if t[0] > self.timestep_range[0] or t[0] < self.timestep_range[1]:
@@ -804,7 +617,7 @@ class T2IAdapter(ControlBase):
        self.copy_to(c)
        return c

-def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options
+def load_t2i_adapter(t2i_data):
    compression_ratio = 8
    upscale_algorithm = 'nearest-exact'

@@ -815,7 +628,7 @@ def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options
        for i in range(4):
            for j in range(2):
                prefix_replace["adapter.body.{}.resnets.{}.".format(i, j)] = "body.{}.".format(i * 2 + j)
-            prefix_replace["adapter.body.{}.".format(i, )] = "body.{}.".format(i * 2)
+            prefix_replace["adapter.body.{}.".format(i, j)] = "body.{}.".format(i * 2)
        prefix_replace["adapter."] = ""
        t2i_data = comfy.utils.state_dict_prefix_replace(t2i_data, prefix_replace)
    keys = t2i_data.keys()
--- a/comfy/diffusers_convert.py
+++ b/comfy/diffusers_convert.py
@@ -4,6 +4,105 @@ import logging

 # conversion code from https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py

+# =================#
+# UNet Conversion #
+# =================#
+
+unet_conversion_map = [
+    # (stable-diffusion, HF Diffusers)
+    ("time_embed.0.weight", "time_embedding.linear_1.weight"),
+    ("time_embed.0.bias", "time_embedding.linear_1.bias"),
+    ("time_embed.2.weight", "time_embedding.linear_2.weight"),
+    ("time_embed.2.bias", "time_embedding.linear_2.bias"),
+    ("input_blocks.0.0.weight", "conv_in.weight"),
+    ("input_blocks.0.0.bias", "conv_in.bias"),
+    ("out.0.weight", "conv_norm_out.weight"),
+    ("out.0.bias", "conv_norm_out.bias"),
+    ("out.2.weight", "conv_out.weight"),
+    ("out.2.bias", "conv_out.bias"),
+]
+
+unet_conversion_map_resnet = [
+    # (stable-diffusion, HF Diffusers)
+    ("in_layers.0", "norm1"),
+    ("in_layers.2", "conv1"),
+    ("out_layers.0", "norm2"),
+    ("out_layers.3", "conv2"),
+    ("emb_layers.1", "time_emb_proj"),
+    ("skip_connection", "conv_shortcut"),
+]
+
+unet_conversion_map_layer = []
+# hardcoded number of downblocks and resnets/attentions...
+# would need smarter logic for other networks.
+for i in range(4):
+    # loop over downblocks/upblocks
+
+    for j in range(2):
+        # loop over resnets/attentions for downblocks
+        hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
+        sd_down_res_prefix = f"input_blocks.{3 * i + j + 1}.0."
+        unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
+
+        if i < 3:
+            # no attention layers in down_blocks.3
+            hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
+            sd_down_atn_prefix = f"input_blocks.{3 * i + j + 1}.1."
+            unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
+
+    for j in range(3):
+        # loop over resnets/attentions for upblocks
+        hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
+        sd_up_res_prefix = f"output_blocks.{3 * i + j}.0."
+        unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
+
+        if i > 0:
+            # no attention layers in up_blocks.0
+            hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
+            sd_up_atn_prefix = f"output_blocks.{3 * i + j}.1."
+            unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
+
+    if i < 3:
+        # no downsample in down_blocks.3
+        hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
+        sd_downsample_prefix = f"input_blocks.{3 * (i + 1)}.0.op."
+        unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
+
+        # no upsample in up_blocks.3
+        hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
+        sd_upsample_prefix = f"output_blocks.{3 * i + 2}.{1 if i == 0 else 2}."
+        unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
+
+hf_mid_atn_prefix = "mid_block.attentions.0."
+sd_mid_atn_prefix = "middle_block.1."
+unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
+
+for j in range(2):
+    hf_mid_res_prefix = f"mid_block.resnets.{j}."
+    sd_mid_res_prefix = f"middle_block.{2 * j}."
+    unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
+
+
+def convert_unet_state_dict(unet_state_dict):
+    # buyer beware: this is a *brittle* function,
+    # and correct output requires that all of these pieces interact in
+    # the exact order in which I have arranged them.
+    mapping = {k: k for k in unet_state_dict.keys()}
+    for sd_name, hf_name in unet_conversion_map:
+        mapping[hf_name] = sd_name
+    for k, v in mapping.items():
+        if "resnets" in k:
+            for sd_part, hf_part in unet_conversion_map_resnet:
+                v = v.replace(hf_part, sd_part)
+            mapping[k] = v
+    for k, v in mapping.items():
+        for sd_part, hf_part in unet_conversion_map_layer:
+            v = v.replace(hf_part, sd_part)
+        mapping[k] = v
+    new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()}
+    return new_state_dict
+
+
 # ================#
 # VAE Conversion #
 # ================#
@@ -58,23 +157,16 @@ vae_conversion_map_attn = [
 ]


-def reshape_weight_for_sd(w, conv3d=False):
+def reshape_weight_for_sd(w):
    # convert HF linear weights to SD conv2d weights
-    if conv3d:
-        return w.reshape(*w.shape, 1, 1, 1)
-    else:
    return w.reshape(*w.shape, 1, 1)


 def convert_vae_state_dict(vae_state_dict):
    mapping = {k: k for k in vae_state_dict.keys()}
-    conv3d = False
    for k, v in mapping.items():
        for sd_part, hf_part in vae_conversion_map:
            v = v.replace(hf_part, sd_part)
-        if v.endswith(".conv.weight"):
-            if not conv3d and vae_state_dict[k].ndim == 5:
-                conv3d = True
        mapping[k] = v
    for k, v in mapping.items():
        if "attentions" in k:
@@ -87,7 +179,7 @@ def convert_vae_state_dict(vae_state_dict):
        for weight_name in weights_to_convert:
            if f"mid.attn_1.{weight_name}.weight" in k:
                logging.debug(f"Reshaping {k} for SD format")
-                new_state_dict[k] = reshape_weight_for_sd(v, conv3d=conv3d)
+                new_state_dict[k] = reshape_weight_for_sd(v)
    return new_state_dict


@@ -114,7 +206,6 @@ textenc_pattern = re.compile("|".join(protected.keys()))
 # Ordering is from https://github.com/pytorch/pytorch/blob/master/test/cpp/api/modules.cpp
 code2idx = {"q": 0, "k": 1, "v": 2}

-
 # This function exists because at the time of writing torch.cat can't do fp8 with cuda
 def cat_tensors(tensors):
    x = 0
@@ -131,7 +222,6 @@ def cat_tensors(tensors):

    return out

-
 def convert_text_enc_state_dict_v20(text_enc_dict, prefix=""):
    new_state_dict = {}
    capture_qkv_weight = {}
@@ -187,3 +277,5 @@ def convert_text_enc_state_dict_v20(text_enc_dict, prefix=""):

 def convert_text_enc_state_dict(text_enc_dict):
    return text_enc_dict
+
+
--- a/comfy/diffusers_load.py
+++ b/comfy/diffusers_load.py
@@ -22,7 +22,7 @@ def load_diffusers(model_path, output_vae=True, output_clip=True, embedding_dire
    if text_encoder2_path is not None:
        text_encoder_paths.append(text_encoder2_path)

-    unet = comfy.sd.load_diffusion_model(unet_path)
+    unet = comfy.sd.load_unet(unet_path)

    clip = None
    if output_clip:
--- a/comfy/extra_samplers/uni_pc.py
+++ b/comfy/extra_samplers/uni_pc.py
@@ -1,10 +1,10 @@
 #code taken from: https://github.com/wl-zhao/UniPC and modified

 import torch
+import torch.nn.functional as F
 import math
-import logging

-from tqdm.auto import trange
+from tqdm.auto import trange, tqdm


 class NoiseScheduleVP:
@@ -16,7 +16,7 @@ class NoiseScheduleVP:
            continuous_beta_0=0.1,
            continuous_beta_1=20.,
        ):
-        r"""Create a wrapper class for the forward SDE (VP type).
+        """Create a wrapper class for the forward SDE (VP type).

        ***
        Update: We support discrete-time diffusion models by implementing a picewise linear interpolation for log_alpha_t.
@@ -475,7 +475,7 @@ class UniPC:
            return self.multistep_uni_pc_vary_update(x, model_prev_list, t_prev_list, t, order, **kwargs)

    def multistep_uni_pc_vary_update(self, x, model_prev_list, t_prev_list, t, order, use_corrector=True):
-        logging.info(f'using unified predictor-corrector with order {order} (solver type: vary coeff)')
+        print(f'using unified predictor-corrector with order {order} (solver type: vary coeff)')
        ns = self.noise_schedule
        assert order <= len(model_prev_list)

@@ -519,6 +519,7 @@ class UniPC:
            A_p = C_inv_p

        if use_corrector:
+            print('using corrector')
            C_inv = torch.linalg.inv(C)
            A_c = C_inv

@@ -661,7 +662,7 @@ class UniPC:

            if x_t is None:
                if use_predictor:
-                    pred_res = torch.tensordot(D1s, rhos_p, dims=([1], [0]))  # torch.einsum('k,bkchw->bchw', rhos_p, D1s)
+                    pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
                else:
                    pred_res = 0
                x_t = x_t_ - expand_dims(alpha_t * B_h, dims) * pred_res
@@ -669,7 +670,7 @@ class UniPC:
            if use_corrector:
                model_t = self.model_fn(x_t, t)
                if D1s is not None:
-                    corr_res = torch.tensordot(D1s, rhos_c[:-1], dims=([1], [0]))  # torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
+                    corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
                else:
                    corr_res = 0
                D1_t = (model_t - model_prev_0)
@@ -703,6 +704,7 @@ class UniPC:
    ):
        # t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
        # t_T = self.noise_schedule.T if t_start is None else t_start
+        device = x.device
        steps = len(timesteps) - 1
        if method == 'multistep':
            assert steps >= order
--- a/comfy/float.py
+++ b/comfy/float.py
@@ -1,67 +0,0 @@
-import torch
-
-def calc_mantissa(abs_x, exponent, normal_mask, MANTISSA_BITS, EXPONENT_BIAS, generator=None):
-    mantissa_scaled = torch.where(
-        normal_mask,
-        (abs_x / (2.0 ** (exponent - EXPONENT_BIAS)) - 1.0) * (2**MANTISSA_BITS),
-        (abs_x / (2.0 ** (-EXPONENT_BIAS + 1 - MANTISSA_BITS)))
-    )
-
-    mantissa_scaled += torch.rand(mantissa_scaled.size(), dtype=mantissa_scaled.dtype, layout=mantissa_scaled.layout, device=mantissa_scaled.device, generator=generator)
-    return mantissa_scaled.floor() / (2**MANTISSA_BITS)
-
-#Not 100% sure about this
-def manual_stochastic_round_to_float8(x, dtype, generator=None):
-    if dtype == torch.float8_e4m3fn:
-        EXPONENT_BITS, MANTISSA_BITS, EXPONENT_BIAS = 4, 3, 7
-    elif dtype == torch.float8_e5m2:
-        EXPONENT_BITS, MANTISSA_BITS, EXPONENT_BIAS = 5, 2, 15
-    else:
-        raise ValueError("Unsupported dtype")
-
-    x = x.half()
-    sign = torch.sign(x)
-    abs_x = x.abs()
-    sign = torch.where(abs_x == 0, 0, sign)
-
-    # Combine exponent calculation and clamping
-    exponent = torch.clamp(
-        torch.floor(torch.log2(abs_x)) + EXPONENT_BIAS,
-        0, 2**EXPONENT_BITS - 1
-    )
-
-    # Combine mantissa calculation and rounding
-    normal_mask = ~(exponent == 0)
-
-    abs_x[:] = calc_mantissa(abs_x, exponent, normal_mask, MANTISSA_BITS, EXPONENT_BIAS, generator=generator)
-
-    sign *= torch.where(
-        normal_mask,
-        (2.0 ** (exponent - EXPONENT_BIAS)) * (1.0 + abs_x),
-        (2.0 ** (-EXPONENT_BIAS + 1)) * abs_x
-    )
-
-    inf = torch.finfo(dtype)
-    torch.clamp(sign, min=inf.min, max=inf.max, out=sign)
-    return sign
-
-
-
-def stochastic_rounding(value, dtype, seed=0):
-    if dtype == torch.float32:
-        return value.to(dtype=torch.float32)
-    if dtype == torch.float16:
-        return value.to(dtype=torch.float16)
-    if dtype == torch.bfloat16:
-        return value.to(dtype=torch.bfloat16)
-    if dtype == torch.float8_e4m3fn or dtype == torch.float8_e5m2:
-        generator = torch.Generator(device=value.device)
-        generator.manual_seed(seed)
-        output = torch.empty_like(value, dtype=dtype)
-        num_slices = max(1, (value.numel() / (4096 * 4096)))
-        slice_size = max(1, round(value.shape[0] / num_slices))
-        for i in range(0, value.shape[0], slice_size):
-            output[i:i+slice_size].copy_(manual_stochastic_round_to_float8(value[i:i+slice_size], dtype, generator=generator))
-        return output
-
-    return value.to(dtype=dtype)
--- a/comfy/gligen.py
+++ b/comfy/gligen.py
@@ -1,4 +1,3 @@
-import math
 import torch
 from torch import nn
 from .ldm.modules.attention import CrossAttention
--- a/comfy/hooks.py
+++ b/comfy/hooks.py
@ -1,785 +0,0 @@
-from __future__ import annotations
-from typing import TYPE_CHECKING, Callable
-import enum
-import math
-import torch
-import numpy as np
-import itertools
-import logging
-
-if TYPE_CHECKING:
-    from comfy.model_patcher import ModelPatcher, PatcherInjection
-    from comfy.model_base import BaseModel
-    from comfy.sd import CLIP
-import comfy.lora
-import comfy.model_management
-import comfy.patcher_extension
-from node_helpers import conditioning_set_values
-
-# #######################################################################################################
-# Hooks explanation
-# -------------------
-# The purpose of hooks is to allow conds to influence sampling without the need for ComfyUI core code to
-# make explicit special cases like it does for ControlNet and GLIGEN.
-#
-# This is necessary for nodes/features that are intended for use with masked or scheduled conds, or those
-# that should run special code when a 'marked' cond is used in sampling.
-# #######################################################################################################
-
-class EnumHookMode(enum.Enum):
-    '''
-    Priority of hook memory optimization vs. speed, mostly related to WeightHooks.
-
-    MinVram: No caching will occur for any operations related to hooks.
-    MaxSpeed: Excess VRAM (and RAM, once VRAM is sufficiently depleted) will be used to cache hook weights when switching hook groups.
-    '''
-    MinVram = "minvram"
-    MaxSpeed = "maxspeed"
-
-class EnumHookType(enum.Enum):
-    '''
-    Hook types, each of which has different expected behavior.
-    '''
-    Weight = "weight"
-    ObjectPatch = "object_patch"
-    AdditionalModels = "add_models"
-    TransformerOptions = "transformer_options"
-    Injections = "add_injections"
-
-class EnumWeightTarget(enum.Enum):
-    Model = "model"
-    Clip = "clip"
-
-class EnumHookScope(enum.Enum):
-    '''
-    Determines if hook should be limited in its influence over sampling.
-
-    AllConditioning: hook will affect all conds used in sampling.
-    HookedOnly: hook will only affect the conds it was attached to.
-    '''
-    AllConditioning = "all_conditioning"
-    HookedOnly = "hooked_only"
-
-
-class _HookRef:
-    pass
-
-
-def default_should_register(hook: Hook, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
-    '''Example for how custom_should_register function can look like.'''
-    return True
-
-
-def create_target_dict(target: EnumWeightTarget=None, **kwargs) -> dict[str]:
-    '''Creates base dictionary for use with Hooks' target param.'''
-    d = {}
-    if target is not None:
-        d['target'] = target
-    d.update(kwargs)
-    return d
-
-
-class Hook:
-    def __init__(self, hook_type: EnumHookType=None, hook_ref: _HookRef=None, hook_id: str=None,
-                 hook_keyframe: HookKeyframeGroup=None, hook_scope=EnumHookScope.AllConditioning):
-        self.hook_type = hook_type
-        '''Enum identifying the general class of this hook.'''
-        self.hook_ref = hook_ref if hook_ref else _HookRef()
-        '''Reference shared between hook clones that have the same value. Should NOT be modified.'''
-        self.hook_id = hook_id
-        '''Optional string ID to identify hook; useful if need to consolidate duplicates at registration time.'''
-        self.hook_keyframe = hook_keyframe if hook_keyframe else HookKeyframeGroup()
-        '''Keyframe storage that can be referenced to get strength for current sampling step.'''
-        self.hook_scope = hook_scope
-        '''Scope of where this hook should apply in terms of the conds used in sampling run.'''
-        self.custom_should_register = default_should_register
-        '''Can be overriden with a compatible function to decide if this hook should be registered without the need to override .should_register'''
-
-    @property
-    def strength(self):
-        return self.hook_keyframe.strength
-
-    def initialize_timesteps(self, model: BaseModel):
-        self.reset()
-        self.hook_keyframe.initialize_timesteps(model)
-
-    def reset(self):
-        self.hook_keyframe.reset()
-
-    def clone(self):
-        c: Hook = self.__class__()
-        c.hook_type = self.hook_type
-        c.hook_ref = self.hook_ref
-        c.hook_id = self.hook_id
-        c.hook_keyframe = self.hook_keyframe
-        c.hook_scope = self.hook_scope
-        c.custom_should_register = self.custom_should_register
-        return c
-
-    def should_register(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
-        return self.custom_should_register(self, model, model_options, target_dict, registered)
-
-    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
-        raise NotImplementedError("add_hook_patches should be defined for Hook subclasses")
-
-    def __eq__(self, other: Hook):
-        return self.__class__ == other.__class__ and self.hook_ref == other.hook_ref
-
-    def __hash__(self):
-        return hash(self.hook_ref)
-
-class WeightHook(Hook):
-    '''
-    Hook responsible for tracking weights to be applied to some model/clip.
-
-    Note, value of hook_scope is ignored and is treated as HookedOnly.
-    '''
-    def __init__(self, strength_model=1.0, strength_clip=1.0):
-        super().__init__(hook_type=EnumHookType.Weight, hook_scope=EnumHookScope.HookedOnly)
-        self.weights: dict = None
-        self.weights_clip: dict = None
-        self.need_weight_init = True
-        self._strength_model = strength_model
-        self._strength_clip = strength_clip
-        self.hook_scope = EnumHookScope.HookedOnly # this value does not matter for WeightHooks, just for docs
-
-    @property
-    def strength_model(self):
-        return self._strength_model * self.strength
-
-    @property
-    def strength_clip(self):
-        return self._strength_clip * self.strength
-
-    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
-        if not self.should_register(model, model_options, target_dict, registered):
-            return False
-        weights = None
-
-        target = target_dict.get('target', None)
-        if target == EnumWeightTarget.Clip:
-            strength = self._strength_clip
-        else:
-            strength = self._strength_model
-
-        if self.need_weight_init:
-            key_map = {}
-            if target == EnumWeightTarget.Clip:
-                key_map = comfy.lora.model_lora_keys_clip(model.model, key_map)
-            else:
-                key_map = comfy.lora.model_lora_keys_unet(model.model, key_map)
-            weights = comfy.lora.load_lora(self.weights, key_map, log_missing=False)
-        else:
-            if target == EnumWeightTarget.Clip:
-                weights = self.weights_clip
-            else:
-                weights = self.weights
-        model.add_hook_patches(hook=self, patches=weights, strength_patch=strength)
-        registered.add(self)
-        return True
-        # TODO: add logs about any keys that were not applied
-
-    def clone(self):
-        c: WeightHook = super().clone()
-        c.weights = self.weights
-        c.weights_clip = self.weights_clip
-        c.need_weight_init = self.need_weight_init
-        c._strength_model = self._strength_model
-        c._strength_clip = self._strength_clip
-        return c
-
-class ObjectPatchHook(Hook):
-    def __init__(self, object_patches: dict[str]=None,
-                 hook_scope=EnumHookScope.AllConditioning):
-        super().__init__(hook_type=EnumHookType.ObjectPatch)
-        self.object_patches = object_patches
-        self.hook_scope = hook_scope
-
-    def clone(self):
-        c: ObjectPatchHook = super().clone()
-        c.object_patches = self.object_patches
-        return c
-
-    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
-        raise NotImplementedError("ObjectPatchHook is not supported yet in ComfyUI.")
-
-class AdditionalModelsHook(Hook):
-    '''
-    Hook responsible for telling model management any additional models that should be loaded.
-
-    Note, value of hook_scope is ignored and is treated as AllConditioning.
-    '''
-    def __init__(self, models: list[ModelPatcher]=None, key: str=None):
-        super().__init__(hook_type=EnumHookType.AdditionalModels)
-        self.models = models
-        self.key = key
-
-    def clone(self):
-        c: AdditionalModelsHook = super().clone()
-        c.models = self.models.copy() if self.models else self.models
-        c.key = self.key
-        return c
-
-    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
-        if not self.should_register(model, model_options, target_dict, registered):
-            return False
-        registered.add(self)
-        return True
-
-class TransformerOptionsHook(Hook):
-    '''
-    Hook responsible for adding wrappers, callbacks, patches, or anything else related to transformer_options.
-    '''
-    def __init__(self, transformers_dict: dict[str, dict[str, dict[str, list[Callable]]]]=None,
-                 hook_scope=EnumHookScope.AllConditioning):
-        super().__init__(hook_type=EnumHookType.TransformerOptions)
-        self.transformers_dict = transformers_dict
-        self.hook_scope = hook_scope
-        self._skip_adding = False
-        '''Internal value used to avoid double load of transformer_options when hook_scope is AllConditioning.'''
-
-    def clone(self):
-        c: TransformerOptionsHook = super().clone()
-        c.transformers_dict = self.transformers_dict
-        c._skip_adding = self._skip_adding
-        return c
-
-    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
-        if not self.should_register(model, model_options, target_dict, registered):
-            return False
-        # NOTE: to_load_options will be used to manually load patches/wrappers/callbacks from hooks
-        self._skip_adding = False
-        if self.hook_scope == EnumHookScope.AllConditioning:
-            add_model_options = {"transformer_options": self.transformers_dict,
-                                 "to_load_options": self.transformers_dict}
-            # skip_adding if included in AllConditioning to avoid double loading
-            self._skip_adding = True
-        else:
-            add_model_options = {"to_load_options": self.transformers_dict}
-        registered.add(self)
-        comfy.patcher_extension.merge_nested_dicts(model_options, add_model_options, copy_dict1=False)
-        return True
-
-    def on_apply_hooks(self, model: ModelPatcher, transformer_options: dict[str]):
-        if not self._skip_adding:
-            comfy.patcher_extension.merge_nested_dicts(transformer_options, self.transformers_dict, copy_dict1=False)
-
-WrapperHook = TransformerOptionsHook
-'''Only here for backwards compatibility, WrapperHook is identical to TransformerOptionsHook.'''
-
-class InjectionsHook(Hook):
-    def __init__(self, key: str=None, injections: list[PatcherInjection]=None,
-                 hook_scope=EnumHookScope.AllConditioning):
-        super().__init__(hook_type=EnumHookType.Injections)
-        self.key = key
-        self.injections = injections
-        self.hook_scope = hook_scope
-
-    def clone(self):
-        c: InjectionsHook = super().clone()
-        c.key = self.key
-        c.injections = self.injections.copy() if self.injections else self.injections
-        return c
-
-    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
-        raise NotImplementedError("InjectionsHook is not supported yet in ComfyUI.")
-
-class HookGroup:
-    '''
-    Stores groups of hooks, and allows them to be queried by type.
-
-    To prevent breaking their functionality, never modify the underlying self.hooks or self._hook_dict vars directly;
-    always use the provided functions on HookGroup.
-    '''
-    def __init__(self):
-        self.hooks: list[Hook] = []
-        self._hook_dict: dict[EnumHookType, list[Hook]] = {}
-
-    def __len__(self):
-        return len(self.hooks)
-
-    def add(self, hook: Hook):
-        if hook not in self.hooks:
-            self.hooks.append(hook)
-            self._hook_dict.setdefault(hook.hook_type, []).append(hook)
-
-    def remove(self, hook: Hook):
-        if hook in self.hooks:
-            self.hooks.remove(hook)
-            self._hook_dict[hook.hook_type].remove(hook)
-
-    def get_type(self, hook_type: EnumHookType):
-        return self._hook_dict.get(hook_type, [])
-
-    def contains(self, hook: Hook):
-        return hook in self.hooks
-
-    def is_subset_of(self, other: HookGroup):
-        self_hooks = set(self.hooks)
-        other_hooks = set(other.hooks)
-        return self_hooks.issubset(other_hooks)
-
-    def new_with_common_hooks(self, other: HookGroup):
-        c = HookGroup()
-        for hook in self.hooks:
-            if other.contains(hook):
-                c.add(hook.clone())
-        return c
-
-    def clone(self):
-        c = HookGroup()
-        for hook in self.hooks:
-            c.add(hook.clone())
-        return c
-
-    def clone_and_combine(self, other: HookGroup):
-        c = self.clone()
-        if other is not None:
-            for hook in other.hooks:
-                c.add(hook.clone())
-        return c
-
-    def set_keyframes_on_hooks(self, hook_kf: HookKeyframeGroup):
-        if hook_kf is None:
-            hook_kf = HookKeyframeGroup()
-        else:
-            hook_kf = hook_kf.clone()
-        for hook in self.hooks:
-            hook.hook_keyframe = hook_kf
-
-    def get_hooks_for_clip_schedule(self):
-        scheduled_hooks: dict[WeightHook, list[tuple[tuple[float,float], HookKeyframe]]] = {}
-        # only care about WeightHooks, for now
-        for hook in self.get_type(EnumHookType.Weight):
-            hook: WeightHook
-            hook_schedule = []
-            # if no hook keyframes, assign default value
-            if len(hook.hook_keyframe.keyframes) == 0:
-                hook_schedule.append(((0.0, 1.0), None))
-                scheduled_hooks[hook] = hook_schedule
-                continue
-            # find ranges of values
-            prev_keyframe = hook.hook_keyframe.keyframes[0]
-            for keyframe in hook.hook_keyframe.keyframes:
-                if keyframe.start_percent > prev_keyframe.start_percent and not math.isclose(keyframe.strength, prev_keyframe.strength):
-                    hook_schedule.append(((prev_keyframe.start_percent, keyframe.start_percent), prev_keyframe))
-                    prev_keyframe = keyframe
-                elif keyframe.start_percent == prev_keyframe.start_percent:
-                    prev_keyframe = keyframe
-            # create final range, assuming last start_percent was not 1.0
-            if not math.isclose(prev_keyframe.start_percent, 1.0):
-                hook_schedule.append(((prev_keyframe.start_percent, 1.0), prev_keyframe))
-            scheduled_hooks[hook] = hook_schedule
-        # hooks should not have their schedules in a list of tuples
-        all_ranges: list[tuple[float, float]] = []
-        for range_kfs in scheduled_hooks.values():
-            for t_range, keyframe in range_kfs:
-                all_ranges.append(t_range)
-        # turn list of ranges into boundaries
-        boundaries_set = set(itertools.chain.from_iterable(all_ranges))
-        boundaries_set.add(0.0)
-        boundaries = sorted(boundaries_set)
-        real_ranges = [(boundaries[i], boundaries[i + 1]) for i in range(len(boundaries) - 1)]
-        # with real ranges defined, give appropriate hooks w/ keyframes for each range
-        scheduled_keyframes: list[tuple[tuple[float,float], list[tuple[WeightHook, HookKeyframe]]]] = []
-        for t_range in real_ranges:
-            hooks_schedule = []
-            for hook, val in scheduled_hooks.items():
-                keyframe = None
-                # check if is a keyframe that works for the current t_range
-                for stored_range, stored_kf in val:
-                    # if stored start is less than current end, then fits - give it assigned keyframe
-                    if stored_range[0] < t_range[1] and stored_range[1] > t_range[0]:
-                        keyframe = stored_kf
-                        break
-                hooks_schedule.append((hook, keyframe))
-            scheduled_keyframes.append((t_range, hooks_schedule))
-        return scheduled_keyframes
-
-    def reset(self):
-        for hook in self.hooks:
-            hook.reset()
-
-    @staticmethod
-    def combine_all_hooks(hooks_list: list[HookGroup], require_count=0) -> HookGroup:
-        actual: list[HookGroup] = []
-        for group in hooks_list:
-            if group is not None:
-                actual.append(group)
-        if len(actual) < require_count:
-            raise Exception(f"Need at least {require_count} hooks to combine, but only had {len(actual)}.")
-        # if no hooks, then return None
-        if len(actual) == 0:
-            return None
-        # if only 1 hook, just return itself without cloning
-        elif len(actual) == 1:
-            return actual[0]
-        final_hook: HookGroup = None
-        for hook in actual:
-            if final_hook is None:
-                final_hook = hook.clone()
-            else:
-                final_hook = final_hook.clone_and_combine(hook)
-        return final_hook
-
-
-class HookKeyframe:
-    def __init__(self, strength: float, start_percent=0.0, guarantee_steps=1):
-        self.strength = strength
-        # scheduling
-        self.start_percent = float(start_percent)
-        self.start_t = 999999999.9
-        self.guarantee_steps = guarantee_steps
-
-    def get_effective_guarantee_steps(self, max_sigma: torch.Tensor):
-        '''If keyframe starts before current sampling range (max_sigma), treat as 0.'''
-        if self.start_t > max_sigma:
-            return 0
-        return self.guarantee_steps
-
-    def clone(self):
-        c = HookKeyframe(strength=self.strength,
-                         start_percent=self.start_percent, guarantee_steps=self.guarantee_steps)
-        c.start_t = self.start_t
-        return c
-
-class HookKeyframeGroup:
-    def __init__(self):
-        self.keyframes: list[HookKeyframe] = []
-        self._current_keyframe: HookKeyframe = None
-        self._current_used_steps = 0
-        self._current_index = 0
-        self._current_strength = None
-        self._curr_t = -1.
-
-    # properties shadow those of HookWeightsKeyframe
-    @property
-    def strength(self):
-        if self._current_keyframe is not None:
-            return self._current_keyframe.strength
-        return 1.0
-
-    def reset(self):
-        self._current_keyframe = None
-        self._current_used_steps = 0
-        self._current_index = 0
-        self._current_strength = None
-        self.curr_t = -1.
-        self._set_first_as_current()
-
-    def add(self, keyframe: HookKeyframe):
-        # add to end of list, then sort
-        self.keyframes.append(keyframe)
-        self.keyframes = get_sorted_list_via_attr(self.keyframes, "start_percent")
-        self._set_first_as_current()
-
-    def _set_first_as_current(self):
-        if len(self.keyframes) > 0:
-            self._current_keyframe = self.keyframes[0]
-        else:
-            self._current_keyframe = None
-
-    def has_guarantee_steps(self):
-        for kf in self.keyframes:
-            if kf.guarantee_steps > 0:
-                return True
-        return False
-
-    def has_index(self, index: int):
-        return index >= 0 and index < len(self.keyframes)
-
-    def is_empty(self):
-        return len(self.keyframes) == 0
-
-    def clone(self):
-        c = HookKeyframeGroup()
-        for keyframe in self.keyframes:
-            c.keyframes.append(keyframe.clone())
-        c._set_first_as_current()
-        return c
-
-    def initialize_timesteps(self, model: BaseModel):
-        for keyframe in self.keyframes:
-            keyframe.start_t = model.model_sampling.percent_to_sigma(keyframe.start_percent)
-
-    def prepare_current_keyframe(self, curr_t: float, transformer_options: dict[str, torch.Tensor]) -> bool:
-        if self.is_empty():
-            return False
-        if curr_t == self._curr_t:
-            return False
-        max_sigma = torch.max(transformer_options["sample_sigmas"])
-        prev_index = self._current_index
-        prev_strength = self._current_strength
-        # if met guaranteed steps, look for next keyframe in case need to switch
-        if self._current_used_steps >= self._current_keyframe.get_effective_guarantee_steps(max_sigma):
-            # if has next index, loop through and see if need to switch
-            if self.has_index(self._current_index+1):
-                for i in range(self._current_index+1, len(self.keyframes)):
-                    eval_c = self.keyframes[i]
-                    # check if start_t is greater or equal to curr_t
-                    # NOTE: t is in terms of sigmas, not percent, so bigger number = earlier step in sampling
-                    if eval_c.start_t >= curr_t:
-                        self._current_index = i
-                        self._current_strength = eval_c.strength
-                        self._current_keyframe = eval_c
-                        self._current_used_steps = 0
-                        # if guarantee_steps greater than zero, stop searching for other keyframes
-                        if self._current_keyframe.get_effective_guarantee_steps(max_sigma) > 0:
-                            break
-                    # if eval_c is outside the percent range, stop looking further
-                    else: break
-        # update steps current context is used
-        self._current_used_steps += 1
-        # update current timestep this was performed on
-        self._curr_t = curr_t
-        # return True if keyframe changed, False if no change
-        return prev_index != self._current_index and prev_strength != self._current_strength
-
-
-class InterpolationMethod:
-    LINEAR = "linear"
-    EASE_IN = "ease_in"
-    EASE_OUT = "ease_out"
-    EASE_IN_OUT = "ease_in_out"
-
-    _LIST = [LINEAR, EASE_IN, EASE_OUT, EASE_IN_OUT]
-
-    @classmethod
-    def get_weights(cls, num_from: float, num_to: float, length: int, method: str, reverse=False):
-        diff = num_to - num_from
-        if method == cls.LINEAR:
-            weights = torch.linspace(num_from, num_to, length)
-        elif method == cls.EASE_IN:
-            index = torch.linspace(0, 1, length)
-            weights = diff * np.power(index, 2) + num_from
-        elif method == cls.EASE_OUT:
-            index = torch.linspace(0, 1, length)
-            weights = diff * (1 - np.power(1 - index, 2)) + num_from
-        elif method == cls.EASE_IN_OUT:
-            index = torch.linspace(0, 1, length)
-            weights = diff * ((1 - np.cos(index * np.pi)) / 2) + num_from
-        else:
-            raise ValueError(f"Unrecognized interpolation method '{method}'.")
-        if reverse:
-            weights = weights.flip(dims=(0,))
-        return weights
-
-def get_sorted_list_via_attr(objects: list, attr: str) -> list:
-    if not objects:
-        return objects
-    elif len(objects) <= 1:
-        return [x for x in objects]
-    # now that we know we have to sort, do it following these rules:
-    # a) if objects have same value of attribute, maintain their relative order
-    # b) perform sorting of the groups of objects with same attributes
-    unique_attrs = {}
-    for o in objects:
-        val_attr = getattr(o, attr)
-        attr_list: list = unique_attrs.get(val_attr, list())
-        attr_list.append(o)
-        if val_attr not in unique_attrs:
-            unique_attrs[val_attr] = attr_list
-    # now that we have the unique attr values grouped together in relative order, sort them by key
-    sorted_attrs = dict(sorted(unique_attrs.items()))
-    # now flatten out the dict into a list to return
-    sorted_list = []
-    for object_list in sorted_attrs.values():
-        sorted_list.extend(object_list)
-    return sorted_list
-
-def create_transformer_options_from_hooks(model: ModelPatcher, hooks: HookGroup,  transformer_options: dict[str]=None):
-    # if no hooks or is not a ModelPatcher for sampling, return empty dict
-    if hooks is None or model.is_clip:
-        return {}
-    if transformer_options is None:
-        transformer_options = {}
-    for hook in hooks.get_type(EnumHookType.TransformerOptions):
-        hook: TransformerOptionsHook
-        hook.on_apply_hooks(model, transformer_options)
-    return transformer_options
-
-def create_hook_lora(lora: dict[str, torch.Tensor], strength_model: float, strength_clip: float):
-    hook_group = HookGroup()
-    hook = WeightHook(strength_model=strength_model, strength_clip=strength_clip)
-    hook_group.add(hook)
-    hook.weights = lora
-    return hook_group
-
-def create_hook_model_as_lora(weights_model, weights_clip, strength_model: float, strength_clip: float):
-    hook_group = HookGroup()
-    hook = WeightHook(strength_model=strength_model, strength_clip=strength_clip)
-    hook_group.add(hook)
-    patches_model = None
-    patches_clip = None
-    if weights_model is not None:
-        patches_model = {}
-        for key in weights_model:
-            patches_model[key] = ("model_as_lora", (weights_model[key],))
-    if weights_clip is not None:
-        patches_clip = {}
-        for key in weights_clip:
-            patches_clip[key] = ("model_as_lora", (weights_clip[key],))
-    hook.weights = patches_model
-    hook.weights_clip = patches_clip
-    hook.need_weight_init = False
-    return hook_group
-
-def get_patch_weights_from_model(model: ModelPatcher, discard_model_sampling=True):
-    if model is None:
-        return None
-    patches_model: dict[str, torch.Tensor] = model.model.state_dict()
-    if discard_model_sampling:
-        # do not include ANY model_sampling components of the model that should act as a patch
-        for key in list(patches_model.keys()):
-            if key.startswith("model_sampling"):
-                patches_model.pop(key, None)
-    return patches_model
-
-# NOTE: this function shows how to register weight hooks directly on the ModelPatchers
-def load_hook_lora_for_models(model: ModelPatcher, clip: CLIP, lora: dict[str, torch.Tensor],
-                              strength_model: float, strength_clip: float):
-    key_map = {}
-    if model is not None:
-        key_map = comfy.lora.model_lora_keys_unet(model.model, key_map)
-    if clip is not None:
-        key_map = comfy.lora.model_lora_keys_clip(clip.cond_stage_model, key_map)
-
-    hook_group = HookGroup()
-    hook = WeightHook()
-    hook_group.add(hook)
-    loaded: dict[str] = comfy.lora.load_lora(lora, key_map)
-    if model is not None:
-        new_modelpatcher = model.clone()
-        k = new_modelpatcher.add_hook_patches(hook=hook, patches=loaded, strength_patch=strength_model)
-    else:
-        k = ()
-        new_modelpatcher = None
-
-    if clip is not None:
-        new_clip = clip.clone()
-        k1 = new_clip.patcher.add_hook_patches(hook=hook, patches=loaded, strength_patch=strength_clip)
-    else:
-        k1 = ()
-        new_clip = None
-    k = set(k)
-    k1 = set(k1)
-    for x in loaded:
-        if (x not in k) and (x not in k1):
-            logging.warning(f"NOT LOADED {x}")
-    return (new_modelpatcher, new_clip, hook_group)
-
-def _combine_hooks_from_values(c_dict: dict[str, HookGroup], values: dict[str, HookGroup], cache: dict[tuple[HookGroup, HookGroup], HookGroup]):
-    hooks_key = 'hooks'
-    # if hooks only exist in one dict, do what's needed so that it ends up in c_dict
-    if hooks_key not in values:
-        return
-    if hooks_key not in c_dict:
-        hooks_value = values.get(hooks_key, None)
-        if hooks_value is not None:
-            c_dict[hooks_key] = hooks_value
-        return
-    # otherwise, need to combine with minimum duplication via cache
-    hooks_tuple = (c_dict[hooks_key], values[hooks_key])
-    cached_hooks = cache.get(hooks_tuple, None)
-    if cached_hooks is None:
-        new_hooks = hooks_tuple[0].clone_and_combine(hooks_tuple[1])
-        cache[hooks_tuple] = new_hooks
-        c_dict[hooks_key] = new_hooks
-    else:
-        c_dict[hooks_key] = cache[hooks_tuple]
-
-def conditioning_set_values_with_hooks(conditioning, values={}, append_hooks=True,
-                                       cache: dict[tuple[HookGroup, HookGroup], HookGroup]=None):
-    c = []
-    if cache is None:
-        cache = {}
-    for t in conditioning:
-        n = [t[0], t[1].copy()]
-        for k in values:
-            if append_hooks and k == 'hooks':
-                _combine_hooks_from_values(n[1], values, cache)
-            else:
-                n[1][k] = values[k]
-        c.append(n)
-
-    return c
-
-def set_hooks_for_conditioning(cond, hooks: HookGroup, append_hooks=True, cache: dict[tuple[HookGroup, HookGroup], HookGroup]=None):
-    if hooks is None:
-        return cond
-    return conditioning_set_values_with_hooks(cond, {'hooks': hooks}, append_hooks=append_hooks, cache=cache)
-
-def set_timesteps_for_conditioning(cond, timestep_range: tuple[float,float]):
-    if timestep_range is None:
-        return cond
-    return conditioning_set_values(cond, {"start_percent": timestep_range[0],
-                                          "end_percent": timestep_range[1]})
-
-def set_mask_for_conditioning(cond, mask: torch.Tensor, set_cond_area: str, strength: float):
-    if mask is None:
-        return cond
-    set_area_to_bounds = False
-    if set_cond_area != 'default':
-        set_area_to_bounds = True
-    if len(mask.shape) < 3:
-        mask = mask.unsqueeze(0)
-    return conditioning_set_values(cond, {'mask': mask,
-                                          'set_area_to_bounds': set_area_to_bounds,
-                                          'mask_strength': strength})
-
-def combine_conditioning(conds: list):
-    combined_conds = []
-    for cond in conds:
-        combined_conds.extend(cond)
-    return combined_conds
-
-def combine_with_new_conds(conds: list, new_conds: list):
-    combined_conds = []
-    for c, new_c in zip(conds, new_conds):
-        combined_conds.append(combine_conditioning([c, new_c]))
-    return combined_conds
-
-def set_conds_props(conds: list, strength: float, set_cond_area: str,
-                   mask: torch.Tensor=None, hooks: HookGroup=None, timesteps_range: tuple[float,float]=None, append_hooks=True):
-    final_conds = []
-    cache = {}
-    for c in conds:
-        # first, apply lora_hook to conditioning, if provided
-        c = set_hooks_for_conditioning(c, hooks, append_hooks=append_hooks, cache=cache)
-        # next, apply mask to conditioning
-        c = set_mask_for_conditioning(cond=c, mask=mask, strength=strength, set_cond_area=set_cond_area)
-        # apply timesteps, if present
-        c = set_timesteps_for_conditioning(cond=c, timestep_range=timesteps_range)
-        # finally, apply mask to conditioning and store
-        final_conds.append(c)
-    return final_conds
-
-def set_conds_props_and_combine(conds: list, new_conds: list, strength: float=1.0, set_cond_area: str="default",
-                               mask: torch.Tensor=None, hooks: HookGroup=None, timesteps_range: tuple[float,float]=None, append_hooks=True):
-    combined_conds = []
-    cache = {}
-    for c, masked_c in zip(conds, new_conds):
-        # first, apply lora_hook to new conditioning, if provided
-        masked_c = set_hooks_for_conditioning(masked_c, hooks, append_hooks=append_hooks, cache=cache)
-        # next, apply mask to new conditioning, if provided
-        masked_c = set_mask_for_conditioning(cond=masked_c, mask=mask, set_cond_area=set_cond_area, strength=strength)
-        # apply timesteps, if present
-        masked_c = set_timesteps_for_conditioning(cond=masked_c, timestep_range=timesteps_range)
-        # finally, combine with existing conditioning and store
-        combined_conds.append(combine_conditioning([c, masked_c]))
-    return combined_conds
-
-def set_default_conds_and_combine(conds: list, new_conds: list,
-                                   hooks: HookGroup=None, timesteps_range: tuple[float,float]=None, append_hooks=True):
-    combined_conds = []
-    cache = {}
-    for c, new_c in zip(conds, new_conds):
-        # first, apply lora_hook to new conditioning, if provided
-        new_c = set_hooks_for_conditioning(new_c, hooks, append_hooks=append_hooks, cache=cache)
-        # next, add default_cond key to cond so that during sampling, it can be identified
-        new_c = conditioning_set_values(new_c, {'default': True})
-        # apply timesteps, if present
-        new_c = set_timesteps_for_conditioning(cond=new_c, timestep_range=timesteps_range)
-        # finally, combine with existing conditioning and store
-        combined_conds.append(combine_conditioning([c, new_c]))
-    return combined_conds
--- a/comfy/image_encoders/dino2.py
+++ b/comfy/image_encoders/dino2.py
@ -1,141 +0,0 @@
-import torch
-from comfy.text_encoders.bert import BertAttention
-import comfy.model_management
-from comfy.ldm.modules.attention import optimized_attention_for_device
-
-
-class Dino2AttentionOutput(torch.nn.Module):
-    def __init__(self, input_dim, output_dim, layer_norm_eps, dtype, device, operations):
-        super().__init__()
-        self.dense = operations.Linear(input_dim, output_dim, dtype=dtype, device=device)
-
-    def forward(self, x):
-        return self.dense(x)
-
-
-class Dino2AttentionBlock(torch.nn.Module):
-    def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations):
-        super().__init__()
-        self.attention = BertAttention(embed_dim, heads, dtype, device, operations)
-        self.output = Dino2AttentionOutput(embed_dim, embed_dim, layer_norm_eps, dtype, device, operations)
-
-    def forward(self, x, mask, optimized_attention):
-        return self.output(self.attention(x, mask, optimized_attention))
-
-
-class LayerScale(torch.nn.Module):
-    def __init__(self, dim, dtype, device, operations):
-        super().__init__()
-        self.lambda1 = torch.nn.Parameter(torch.empty(dim, device=device, dtype=dtype))
-
-    def forward(self, x):
-        return x * comfy.model_management.cast_to_device(self.lambda1, x.device, x.dtype)
-
-
-class SwiGLUFFN(torch.nn.Module):
-    def __init__(self, dim, dtype, device, operations):
-        super().__init__()
-        in_features = out_features = dim
-        hidden_features = int(dim * 4)
-        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
-
-        self.weights_in = operations.Linear(in_features, 2 * hidden_features, bias=True, device=device, dtype=dtype)
-        self.weights_out = operations.Linear(hidden_features, out_features, bias=True, device=device, dtype=dtype)
-
-    def forward(self, x):
-        x = self.weights_in(x)
-        x1, x2 = x.chunk(2, dim=-1)
-        x = torch.nn.functional.silu(x1) * x2
-        return self.weights_out(x)
-
-
-class Dino2Block(torch.nn.Module):
-    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations):
-        super().__init__()
-        self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations)
-        self.layer_scale1 = LayerScale(dim, dtype, device, operations)
-        self.layer_scale2 = LayerScale(dim, dtype, device, operations)
-        self.mlp = SwiGLUFFN(dim, dtype, device, operations)
-        self.norm1 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
-        self.norm2 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
-
-    def forward(self, x, optimized_attention):
-        x = x + self.layer_scale1(self.attention(self.norm1(x), None, optimized_attention))
-        x = x + self.layer_scale2(self.mlp(self.norm2(x)))
-        return x
-
-
-class Dino2Encoder(torch.nn.Module):
-    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations):
-        super().__init__()
-        self.layer = torch.nn.ModuleList([Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations) for _ in range(num_layers)])
-
-    def forward(self, x, intermediate_output=None):
-        optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
-
-        if intermediate_output is not None:
-            if intermediate_output < 0:
-                intermediate_output = len(self.layer) + intermediate_output
-
-        intermediate = None
-        for i, l in enumerate(self.layer):
-            x = l(x, optimized_attention)
-            if i == intermediate_output:
-                intermediate = x.clone()
-        return x, intermediate
-
-
-class Dino2PatchEmbeddings(torch.nn.Module):
-    def __init__(self, dim, num_channels=3, patch_size=14, image_size=518, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.projection = operations.Conv2d(
-            in_channels=num_channels,
-            out_channels=dim,
-            kernel_size=patch_size,
-            stride=patch_size,
-            bias=True,
-            dtype=dtype,
-            device=device
-        )
-
-    def forward(self, pixel_values):
-        return self.projection(pixel_values).flatten(2).transpose(1, 2)
-
-
-class Dino2Embeddings(torch.nn.Module):
-    def __init__(self, dim, dtype, device, operations):
-        super().__init__()
-        patch_size = 14
-        image_size = 518
-
-        self.patch_embeddings = Dino2PatchEmbeddings(dim, patch_size=patch_size, image_size=image_size, dtype=dtype, device=device, operations=operations)
-        self.position_embeddings = torch.nn.Parameter(torch.empty(1, (image_size // patch_size) ** 2 + 1, dim, dtype=dtype, device=device))
-        self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device))
-        self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))
-
-    def forward(self, pixel_values):
-        x = self.patch_embeddings(pixel_values)
-        # TODO: mask_token?
-        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
-        x = x + comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype)
-        return x
-
-
-class Dinov2Model(torch.nn.Module):
-    def __init__(self, config_dict, dtype, device, operations):
-        super().__init__()
-        num_layers = config_dict["num_hidden_layers"]
-        dim = config_dict["hidden_size"]
-        heads = config_dict["num_attention_heads"]
-        layer_norm_eps = config_dict["layer_norm_eps"]
-
-        self.embeddings = Dino2Embeddings(dim, dtype, device, operations)
-        self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations)
-        self.layernorm = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
-
-    def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
-        x = self.embeddings(pixel_values)
-        x, i = self.encoder(x, intermediate_output=intermediate_output)
-        x = self.layernorm(x)
-        pooled_output = x[:, 0, :]
-        return x, i, pooled_output, None
--- a/comfy/image_encoders/dino2_giant.json
+++ b/comfy/image_encoders/dino2_giant.json
@ -1,21 +0,0 @@
-{
-  "attention_probs_dropout_prob": 0.0,
-  "drop_path_rate": 0.0,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.0,
-  "hidden_size": 1536,
-  "image_size": 518,
-  "initializer_range": 0.02,
-  "layer_norm_eps": 1e-06,
-  "layerscale_value": 1.0,
-  "mlp_ratio": 4,
-  "model_type": "dinov2",
-  "num_attention_heads": 24,
-  "num_channels": 3,
-  "num_hidden_layers": 40,
-  "patch_size": 14,
-  "qkv_bias": true,
-  "use_swiglu_ffn": true,
-  "image_mean": [0.485, 0.456, 0.406],
-  "image_std": [0.229, 0.224, 0.225]
-}
--- a/comfy/k_diffusion/deis.py
+++ b/comfy/k_diffusion/deis.py
@ -11,6 +11,7 @@ import numpy as np
 # Transfer from the input time (sigma) used in EDM to that (t) used in DEIS.

 def edm2t(edm_steps, epsilon_s=1e-3, sigma_min=0.002, sigma_max=80):
+    vp_sigma = lambda beta_d, beta_min: lambda t: (np.e ** (0.5 * beta_d * (t ** 2) + beta_min * t) - 1) ** 0.5
    vp_sigma_inv = lambda beta_d, beta_min: lambda sigma: ((beta_min ** 2 + 2 * beta_d * (sigma ** 2 + 1).log()).sqrt() - beta_min) / beta_d
    vp_beta_d = 2 * (np.log(torch.tensor(sigma_min).cpu() ** 2 + 1) / epsilon_s - np.log(torch.tensor(sigma_max).cpu() ** 2 + 1)) / (epsilon_s - 1)
    vp_beta_min = np.log(torch.tensor(sigma_max).cpu() ** 2 + 1) - 0.5 * vp_beta_d
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@ -9,7 +9,6 @@ from tqdm.auto import trange, tqdm
 from . import utils
 from . import deis
 import comfy.model_patcher
-import comfy.model_sampling

 def append_zero(x):
    return torch.cat([x, x.new_zeros([1])])
@ -40,21 +39,10 @@ def get_sigmas_polyexponential(n, sigma_min, sigma_max, rho=1., device='cpu'):
 def get_sigmas_vp(n, beta_d=19.9, beta_min=0.1, eps_s=1e-3, device='cpu'):
    """Constructs a continuous VP noise schedule."""
    t = torch.linspace(1, eps_s, n, device=device)
-    sigmas = torch.sqrt(torch.special.expm1(beta_d * t ** 2 / 2 + beta_min * t))
+    sigmas = torch.sqrt(torch.exp(beta_d * t ** 2 / 2 + beta_min * t) - 1)
    return append_zero(sigmas)


-def get_sigmas_laplace(n, sigma_min, sigma_max, mu=0., beta=0.5, device='cpu'):
-    """Constructs the noise schedule proposed by Tiankai et al. (2024). """
-    epsilon = 1e-5 # avoid log(0)
-    x = torch.linspace(0, 1, n, device=device)
-    clamp = lambda x: torch.clamp(x, min=sigma_min, max=sigma_max)
-    lmb = mu - beta * torch.sign(0.5-x) * torch.log(1 - 2 * torch.abs(0.5-x) + epsilon)
-    sigmas = clamp(torch.exp(lmb))
-    return sigmas
-
-
-
 def to_d(x, sigma, denoised):
    """Converts a denoiser output to a Karras ODE derivative."""
    return (x - denoised) / utils.append_dims(sigma, x.ndim)
@ -70,14 +58,8 @@ def get_ancestral_step(sigma_from, sigma_to, eta=1.):
    return sigma_down, sigma_up


-def default_noise_sampler(x, seed=None):
-    if seed is not None:
-        generator = torch.Generator(device=x.device)
-        generator.manual_seed(seed)
-    else:
-        generator = None
-
-    return lambda sigma, sigma_next: torch.randn(x.size(), dtype=x.dtype, layout=x.layout, device=x.device, generator=generator)
+def default_noise_sampler(x):
+    return lambda sigma, sigma_next: torch.randn_like(x)


 class BatchedBrownianTree:
@ -170,55 +152,23 @@ def sample_euler(model, x, sigmas, extra_args=None, callback=None, disable=None,

@torch.no_grad()
 def sample_euler_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    if isinstance(model.inner_model.inner_model.model_sampling, comfy.model_sampling.CONST):
-        return sample_euler_ancestral_RF(model, x, sigmas, extra_args, callback, disable, eta, s_noise, noise_sampler)
    """Ancestral sampling with Euler method steps."""
    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-
-        if sigma_down == 0:
-            x = denoised
-        else:
        d = to_d(x, sigmas[i], denoised)
        # Euler method
        dt = sigma_down - sigmas[i]
-            x = x + d * dt + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
+        x = x + d * dt
+        if sigmas[i + 1] > 0:
+            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
    return x

-@torch.no_grad()
-def sample_euler_ancestral_RF(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1.0, s_noise=1., noise_sampler=None):
-    """Ancestral sampling with Euler method steps."""
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_in = x.new_ones([x.shape[0]])
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        # sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-
-        if sigmas[i + 1] == 0:
-            x = denoised
-        else:
-            downstep_ratio = 1 + (sigmas[i + 1] / sigmas[i] - 1) * eta
-            sigma_down = sigmas[i + 1] * downstep_ratio
-            alpha_ip1 = 1 - sigmas[i + 1]
-            alpha_down = 1 - sigma_down
-            renoise_coeff = (sigmas[i + 1]**2 - sigma_down**2 * alpha_ip1**2 / alpha_down**2)**0.5
-            # Euler method
-            sigma_down_i_ratio = sigma_down / sigmas[i]
-            x = sigma_down_i_ratio * x + (1 - sigma_down_i_ratio) * denoised
-            if eta > 0:
-                x = (alpha_ip1 / alpha_down) * x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * renoise_coeff
-    return x

@torch.no_grad()
 def sample_heun(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
@ -293,13 +243,9 @@ def sample_dpm_2(model, x, sigmas, extra_args=None, callback=None, disable=None,

@torch.no_grad()
 def sample_dpm_2_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    if isinstance(model.inner_model.inner_model.model_sampling, comfy.model_sampling.CONST):
-        return sample_dpm_2_ancestral_RF(model, x, sigmas, extra_args, callback, disable, eta, s_noise, noise_sampler)
-
    """Ancestral sampling with DPM-Solver second-order steps."""
    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -323,39 +269,6 @@ def sample_dpm_2_ancestral(model, x, sigmas, extra_args=None, callback=None, dis
            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
    return x

-@torch.no_grad()
-def sample_dpm_2_ancestral_RF(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    """Ancestral sampling with DPM-Solver second-order steps."""
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_in = x.new_ones([x.shape[0]])
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        downstep_ratio = 1 + (sigmas[i+1]/sigmas[i] - 1) * eta
-        sigma_down = sigmas[i+1] * downstep_ratio
-        alpha_ip1 = 1 - sigmas[i+1]
-        alpha_down = 1 - sigma_down
-        renoise_coeff = (sigmas[i+1]**2 - sigma_down**2*alpha_ip1**2/alpha_down**2)**0.5
-
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        d = to_d(x, sigmas[i], denoised)
-        if sigma_down == 0:
-            # Euler method
-            dt = sigma_down - sigmas[i]
-            x = x + d * dt
-        else:
-            # DPM-Solver-2
-            sigma_mid = sigmas[i].log().lerp(sigma_down.log(), 0.5).exp()
-            dt_1 = sigma_mid - sigmas[i]
-            dt_2 = sigma_down - sigmas[i]
-            x_2 = x + d * dt_1
-            denoised_2 = model(x_2, sigma_mid * s_in, **extra_args)
-            d_2 = to_d(x_2, sigma_mid, denoised_2)
-            x = x + d_2 * dt_2
-            x = (alpha_ip1/alpha_down) * x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * renoise_coeff
-    return x

 def linear_multistep_coeff(order, t, i, j):
    if order - 1 > i:
@ -475,7 +388,7 @@ class DPMSolver(nn.Module):
        return x_3, eps_cache

    def dpm_solver_fast(self, x, t_start, t_end, nfe, eta=0., s_noise=1., noise_sampler=None):
-        noise_sampler = default_noise_sampler(x, seed=self.extra_args.get("seed", None)) if noise_sampler is None else noise_sampler
+        noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
        if not t_end > t_start and eta:
            raise ValueError('eta must be 0 for reverse sampling')

@ -514,7 +427,7 @@ class DPMSolver(nn.Module):
        return x

    def dpm_solver_adaptive(self, x, t_start, t_end, order=3, rtol=0.05, atol=0.0078, h_init=0.05, pcoeff=0., icoeff=1., dcoeff=0., accept_safety=0.81, eta=0., s_noise=1., noise_sampler=None):
-        noise_sampler = default_noise_sampler(x, seed=self.extra_args.get("seed", None)) if noise_sampler is None else noise_sampler
+        noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
        if order not in {2, 3}:
            raise ValueError('order should be 2 or 3')
        forward = t_end > t_start
@ -596,13 +509,9 @@ def sample_dpm_adaptive(model, x, sigma_min, sigma_max, extra_args=None, callbac

@torch.no_grad()
 def sample_dpmpp_2s_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    if isinstance(model.inner_model.inner_model.model_sampling, comfy.model_sampling.CONST):
-        return sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args, callback, disable, eta, s_noise, noise_sampler)
-
    """Ancestral sampling with DPM-Solver++(2S) second-order steps."""
    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    sigma_fn = lambda t: t.neg().exp()
    t_fn = lambda sigma: sigma.log().neg()
@ -632,66 +541,16 @@ def sample_dpmpp_2s_ancestral(model, x, sigmas, extra_args=None, callback=None,
    return x


-@torch.no_grad()
-def sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    """Ancestral sampling with DPM-Solver++(2S) second-order steps."""
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_in = x.new_ones([x.shape[0]])
-    sigma_fn = lambda lbda: (lbda.exp() + 1) ** -1
-    lambda_fn = lambda sigma: ((1-sigma)/sigma).log()
-
-    # logged_x = x.unsqueeze(0)
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        downstep_ratio = 1 + (sigmas[i+1]/sigmas[i] - 1) * eta
-        sigma_down = sigmas[i+1] * downstep_ratio
-        alpha_ip1 = 1 - sigmas[i+1]
-        alpha_down = 1 - sigma_down
-        renoise_coeff = (sigmas[i+1]**2 - sigma_down**2*alpha_ip1**2/alpha_down**2)**0.5
-        # sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        if sigmas[i + 1] == 0:
-            # Euler method
-            d = to_d(x, sigmas[i], denoised)
-            dt = sigma_down - sigmas[i]
-            x = x + d * dt
-        else:
-            # DPM-Solver++(2S)
-            if sigmas[i] == 1.0:
-                sigma_s = 0.9999
-            else:
-                t_i, t_down = lambda_fn(sigmas[i]), lambda_fn(sigma_down)
-                r = 1 / 2
-                h = t_down - t_i
-                s = t_i + r * h
-                sigma_s = sigma_fn(s)
-            # sigma_s = sigmas[i+1]
-            sigma_s_i_ratio = sigma_s / sigmas[i]
-            u = sigma_s_i_ratio * x + (1 - sigma_s_i_ratio) * denoised
-            D_i = model(u, sigma_s * s_in, **extra_args)
-            sigma_down_i_ratio = sigma_down / sigmas[i]
-            x = sigma_down_i_ratio * x + (1 - sigma_down_i_ratio) * D_i
-            # print("sigma_i", sigmas[i], "sigma_ip1", sigmas[i+1],"sigma_down", sigma_down, "sigma_down_i_ratio", sigma_down_i_ratio, "sigma_s_i_ratio", sigma_s_i_ratio, "renoise_coeff", renoise_coeff)
-        # Noise addition
-        if sigmas[i + 1] > 0 and eta > 0:
-            x = (alpha_ip1/alpha_down) * x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * renoise_coeff
-        # logged_x = torch.cat((logged_x, x.unsqueeze(0)), dim=0)
-    return x
-
@torch.no_grad()
 def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
    """DPM-Solver++ (stochastic)."""
    if len(sigmas) <= 1:
        return x

-    extra_args = {} if extra_args is None else extra_args
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    seed = extra_args.get("seed", None)
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
+    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])
    sigma_fn = lambda t: t.neg().exp()
    t_fn = lambda sigma: sigma.log().neg()
@ -762,10 +621,10 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    if solver_type not in {'heun', 'midpoint'}:
        raise ValueError('solver_type must be \'heun\' or \'midpoint\'')

-    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
+    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])

    old_denoised = None
@ -808,10 +667,10 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
    if len(sigmas) <= 1:
        return x

-    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
+    extra_args = {} if extra_args is None else extra_args
    s_in = x.new_ones([x.shape[0]])

    denoised_1, denoised_2 = None, None
@ -858,7 +717,7 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
 def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    if len(sigmas) <= 1:
        return x
-    extra_args = {} if extra_args is None else extra_args
+
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_3m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler)
@ -867,7 +726,7 @@ def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di
 def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'):
    if len(sigmas) <= 1:
        return x
-    extra_args = {} if extra_args is None else extra_args
+
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_2m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type)
@ -876,7 +735,7 @@ def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, di
 def sample_dpmpp_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
    if len(sigmas) <= 1:
        return x
-    extra_args = {} if extra_args is None else extra_args
+
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, r=r)
@ -894,8 +753,7 @@ def DDPMSampler_step(x, sigma, sigma_prev, noise, noise_sampler):

 def generic_step_sampler(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None, step_function=None):
    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])

    for i in trange(len(sigmas) - 1, disable=disable):
@ -915,8 +773,7 @@ def sample_ddpm(model, x, sigmas, extra_args=None, callback=None, disable=None,
@torch.no_grad()
 def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None):
    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
@ -1159,6 +1016,7 @@ def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disabl
        d = to_d(x, sigma_hat, temp[0])
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
+        dt = sigmas[i + 1] - sigma_hat
        # Euler method
        x = denoised + d * sigmas[i + 1]
    return x
@ -1167,8 +1025,7 @@ def sample_euler_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disabl
 def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """Ancestral sampling with Euler method steps."""
    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler

    temp = [0]
    def post_cfg_function(args):
@ -1186,337 +1043,8 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        d = to_d(x, sigmas[i], temp[0])
        # Euler method
-        x = denoised + d * sigma_down
-        if sigmas[i + 1] > 0:
-            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
-    return x
-@torch.no_grad()
-def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    """Ancestral sampling with DPM-Solver++(2S) second-order steps."""
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-
-    temp = [0]
-    def post_cfg_function(args):
-        temp[0] = args["uncond_denoised"]
-        return args["denoised"]
-
-    model_options = extra_args.get("model_options", {}).copy()
-    extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
-
-    s_in = x.new_ones([x.shape[0]])
-    sigma_fn = lambda t: t.neg().exp()
-    t_fn = lambda sigma: sigma.log().neg()
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        if sigma_down == 0:
-            # Euler method
-            d = to_d(x, sigmas[i], temp[0])
-            x = denoised + d * sigma_down
-        else:
-            # DPM-Solver++(2S)
-            t, t_next = t_fn(sigmas[i]), t_fn(sigma_down)
-            # r = torch.sinh(1 + (2 - eta) * (t_next - t) / (t - t_fn(sigma_up))) works only on non-cfgpp, weird
-            r = 1 / 2
-            h = t_next - t
-            s = t + r * h
-            x_2 = (sigma_fn(s) / sigma_fn(t)) * (x + (denoised - temp[0])) - (-h * r).expm1() * denoised
-            denoised_2 = model(x_2, sigma_fn(s) * s_in, **extra_args)
-            x = (sigma_fn(t_next) / sigma_fn(t)) * (x + (denoised - temp[0])) - (-h).expm1() * denoised_2
-        # Noise addition
-        if sigmas[i + 1] > 0:
-            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
-    return x
-
-@torch.no_grad()
-def sample_dpmpp_2m_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None):
-    """DPM-Solver++(2M)."""
-    extra_args = {} if extra_args is None else extra_args
-    s_in = x.new_ones([x.shape[0]])
-    t_fn = lambda sigma: sigma.log().neg()
-
-    old_uncond_denoised = None
-    uncond_denoised = None
-    def post_cfg_function(args):
-        nonlocal uncond_denoised
-        uncond_denoised = args["uncond_denoised"]
-        return args["denoised"]
-
-    model_options = extra_args.get("model_options", {}).copy()
-    extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        t, t_next = t_fn(sigmas[i]), t_fn(sigmas[i + 1])
-        h = t_next - t
-        if old_uncond_denoised is None or sigmas[i + 1] == 0:
-            denoised_mix = -torch.exp(-h) * uncond_denoised
-        else:
-            h_last = t - t_fn(sigmas[i - 1])
-            r = h_last / h
-            denoised_mix = -torch.exp(-h) * uncond_denoised - torch.expm1(-h) * (1 / (2 * r)) * (denoised - old_uncond_denoised)
-        x = denoised + denoised_mix + torch.exp(-h) * x
-        old_uncond_denoised = uncond_denoised
-    return x
-
-@torch.no_grad()
-def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None, eta=1., cfg_pp=False):
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_in = x.new_ones([x.shape[0]])
-    sigma_fn = lambda t: t.neg().exp()
-    t_fn = lambda sigma: sigma.log().neg()
-    phi1_fn = lambda t: torch.expm1(t) / t
-    phi2_fn = lambda t: (phi1_fn(t) - 1.0) / t
-
-    old_denoised = None
-    uncond_denoised = None
-    def post_cfg_function(args):
-        nonlocal uncond_denoised
-        uncond_denoised = args["uncond_denoised"]
-        return args["denoised"]
-
-    if cfg_pp:
-        model_options = extra_args.get("model_options", {}).copy()
-        extra_args["model_options"] = comfy.model_patcher.set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=True)
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta)
-        if callback is not None:
-            callback({"x": x, "i": i, "sigma": sigmas[i], "sigma_hat": sigmas[i], "denoised": denoised})
-        if sigma_down == 0 or old_denoised is None:
-            # Euler method
-            if cfg_pp:
-                d = to_d(x, sigmas[i], uncond_denoised)
-                x = denoised + d * sigma_down
-            else:
-                d = to_d(x, sigmas[i], denoised)
        dt = sigma_down - sigmas[i]
-                x = x + d * dt
-        else:
-            # Second order multistep method in https://arxiv.org/pdf/2308.02157
-            t, t_next, t_prev = t_fn(sigmas[i]), t_fn(sigma_down), t_fn(sigmas[i - 1])
-            h = t_next - t
-            c2 = (t_prev - t) / h
-
-            phi1_val, phi2_val = phi1_fn(-h), phi2_fn(-h)
-            b1 = torch.nan_to_num(phi1_val - phi2_val / c2, nan=0.0)
-            b2 = torch.nan_to_num(phi2_val / c2, nan=0.0)
-
-            if cfg_pp:
-                x = x + (denoised - uncond_denoised)
-                x = sigma_fn(h) * x + h * (b1 * uncond_denoised + b2 * old_denoised)
-            else:
-                x = sigma_fn(h) * x + h * (b1 * denoised + b2 * old_denoised)
-
-        # Noise addition
+        x = denoised + d * sigma_down
        if sigmas[i + 1] > 0:
            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
-
-        if cfg_pp:
-            old_denoised = uncond_denoised
-        else:
-            old_denoised = denoised
-    return x
-
-@torch.no_grad()
-def sample_res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None):
-    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=0., cfg_pp=False)
-
-@torch.no_grad()
-def sample_res_multistep_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None):
-    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=0., cfg_pp=True)
-
-@torch.no_grad()
-def sample_res_multistep_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=eta, cfg_pp=False)
-
-@torch.no_grad()
-def sample_res_multistep_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
-    return res_multistep(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, s_noise=s_noise, noise_sampler=noise_sampler, eta=eta, cfg_pp=True)
-
-@torch.no_grad()
-def sample_gradient_estimation(model, x, sigmas, extra_args=None, callback=None, disable=None, ge_gamma=2.):
-    """Gradient-estimation sampler. Paper: https://openreview.net/pdf?id=o2ND9v0CeK"""
-    extra_args = {} if extra_args is None else extra_args
-    s_in = x.new_ones([x.shape[0]])
-    old_d = None
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        d = to_d(x, sigmas[i], denoised)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        dt = sigmas[i + 1] - sigmas[i]
-        if i == 0:
-            # Euler method
-            x = x + d * dt
-        else:
-            # Gradient estimation
-            d_bar = ge_gamma * d + (1 - ge_gamma) * old_d
-            x = x + d_bar * dt
-        old_d = d
-    return x
-
-@torch.no_grad()
-def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, s_noise=1., noise_sampler=None, noise_scaler=None, max_stage=3):
-    """
-    Extended Reverse-Time SDE solver (VE ER-SDE-Solver-3). Arxiv: https://arxiv.org/abs/2309.06169.
-    Code reference: https://github.com/QinpengCui/ER-SDE-Solver/blob/main/er_sde_solver.py.
-    """
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_in = x.new_ones([x.shape[0]])
-
-    def default_noise_scaler(sigma):
-        return sigma * ((sigma ** 0.3).exp() + 10.0)
-    noise_scaler = default_noise_scaler if noise_scaler is None else noise_scaler
-    num_integration_points = 200.0
-    point_indice = torch.arange(0, num_integration_points, dtype=torch.float32, device=x.device)
-
-    old_denoised = None
-    old_denoised_d = None
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        stage_used = min(max_stage, i + 1)
-        if sigmas[i + 1] == 0:
-            x = denoised
-        elif stage_used == 1:
-            r = noise_scaler(sigmas[i + 1]) / noise_scaler(sigmas[i])
-            x = r * x + (1 - r) * denoised
-        else:
-            r = noise_scaler(sigmas[i + 1]) / noise_scaler(sigmas[i])
-            x = r * x + (1 - r) * denoised
-
-            dt = sigmas[i + 1] - sigmas[i]
-            sigma_step_size = -dt / num_integration_points
-            sigma_pos = sigmas[i + 1] + point_indice * sigma_step_size
-            scaled_pos = noise_scaler(sigma_pos)
-
-            # Stage 2
-            s = torch.sum(1 / scaled_pos) * sigma_step_size
-            denoised_d = (denoised - old_denoised) / (sigmas[i] - sigmas[i - 1])
-            x = x + (dt + s * noise_scaler(sigmas[i + 1])) * denoised_d
-
-            if stage_used >= 3:
-                # Stage 3
-                s_u = torch.sum((sigma_pos - sigmas[i]) / scaled_pos) * sigma_step_size
-                denoised_u = (denoised_d - old_denoised_d) / ((sigmas[i] - sigmas[i - 2]) / 2)
-                x = x + ((dt ** 2) / 2 + s_u * noise_scaler(sigmas[i + 1])) * denoised_u
-            old_denoised_d = denoised_d
-
-        if s_noise != 0 and sigmas[i + 1] > 0:
-            x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * (sigmas[i + 1] ** 2 - sigmas[i] ** 2 * r ** 2).sqrt().nan_to_num(nan=0.0)
-        old_denoised = denoised
-    return x
-
-@torch.no_grad()
-def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5):
-    '''
-    SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VE Data Prediction) stage 2
-    Arxiv: https://arxiv.org/abs/2305.14267
-    '''
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_in = x.new_ones([x.shape[0]])
-
-    inject_noise = eta > 0 and s_noise > 0
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        if sigmas[i + 1] == 0:
-            x = denoised
-        else:
-            t, t_next = -sigmas[i].log(), -sigmas[i + 1].log()
-            h = t_next - t
-            h_eta = h * (eta + 1)
-            s = t + r * h
-            fac = 1 / (2 * r)
-            sigma_s = s.neg().exp()
-
-            coeff_1, coeff_2 = (-r * h_eta).expm1(), (-h_eta).expm1()
-            if inject_noise:
-                noise_coeff_1 = (-2 * r * h * eta).expm1().neg().sqrt()
-                noise_coeff_2 = ((-2 * r * h * eta).expm1() - (-2 * h * eta).expm1()).sqrt()
-                noise_1, noise_2 = noise_sampler(sigmas[i], sigma_s), noise_sampler(sigma_s, sigmas[i + 1])
-
-            # Step 1
-            x_2 = (coeff_1 + 1) * x - coeff_1 * denoised
-            if inject_noise:
-                x_2 = x_2 + sigma_s * (noise_coeff_1 * noise_1) * s_noise
-            denoised_2 = model(x_2, sigma_s * s_in, **extra_args)
-
-            # Step 2
-            denoised_d = (1 - fac) * denoised + fac * denoised_2
-            x = (coeff_2 + 1) * x - coeff_2 * denoised_d
-            if inject_noise:
-                x = x + sigmas[i + 1] * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise
-    return x
-
-@torch.no_grad()
-def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r_1=1./3, r_2=2./3):
-    '''
-    SEEDS-3 - Stochastic Explicit Exponential Derivative-free Solvers (VE Data Prediction) stage 3
-    Arxiv: https://arxiv.org/abs/2305.14267
-    '''
-    extra_args = {} if extra_args is None else extra_args
-    seed = extra_args.get("seed", None)
-    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
-    s_in = x.new_ones([x.shape[0]])
-
-    inject_noise = eta > 0 and s_noise > 0
-
-    for i in trange(len(sigmas) - 1, disable=disable):
-        denoised = model(x, sigmas[i] * s_in, **extra_args)
-        if callback is not None:
-            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
-        if sigmas[i + 1] == 0:
-            x = denoised
-        else:
-            t, t_next = -sigmas[i].log(), -sigmas[i + 1].log()
-            h = t_next - t
-            h_eta = h * (eta + 1)
-            s_1 = t + r_1 * h
-            s_2 = t + r_2 * h
-            sigma_s_1, sigma_s_2 = s_1.neg().exp(), s_2.neg().exp()
-
-            coeff_1, coeff_2, coeff_3 = (-r_1 * h_eta).expm1(), (-r_2 * h_eta).expm1(), (-h_eta).expm1()
-            if inject_noise:
-                noise_coeff_1 = (-2 * r_1 * h * eta).expm1().neg().sqrt()
-                noise_coeff_2 = ((-2 * r_1 * h * eta).expm1() - (-2 * r_2 * h * eta).expm1()).sqrt()
-                noise_coeff_3 = ((-2 * r_2 * h * eta).expm1() - (-2 * h * eta).expm1()).sqrt()
-                noise_1, noise_2, noise_3 = noise_sampler(sigmas[i], sigma_s_1), noise_sampler(sigma_s_1, sigma_s_2), noise_sampler(sigma_s_2, sigmas[i + 1])
-
-            # Step 1
-            x_2 = (coeff_1 + 1) * x - coeff_1 * denoised
-            if inject_noise:
-                x_2 = x_2 + sigma_s_1 * (noise_coeff_1 * noise_1) * s_noise
-            denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args)
-
-            # Step 2
-            x_3 = (coeff_2 + 1) * x - coeff_2 * denoised + (r_2 / r_1) * (coeff_2 / (r_2 * h_eta) + 1) * (denoised_2 - denoised)
-            if inject_noise:
-                x_3 = x_3 + sigma_s_2 * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise
-            denoised_3 = model(x_3, sigma_s_2 * s_in, **extra_args)
-
-            # Step 3
-            x = (coeff_3 + 1) * x - coeff_3 * denoised + (1. / r_2) * (coeff_3 / h_eta + 1) * (denoised_3 - denoised)
-            if inject_noise:
-                x = x + sigmas[i + 1] * (noise_coeff_3 * noise_1 + noise_coeff_2 * noise_2 + noise_coeff_1 * noise_3) * s_noise
    return x
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@ -3,9 +3,7 @@ import torch
 class LatentFormat:
    scale_factor = 1.0
    latent_channels = 4
-    latent_dimensions = 2
    latent_rgb_factors = None
-    latent_rgb_factors_bias = None
    taesd_decoder_name = None

    def process_in(self, latent):
@ -32,13 +30,11 @@ class SDXL(LatentFormat):
    def __init__(self):
        self.latent_rgb_factors = [
                    #   R        G        B
-                    [ 0.3651,  0.4232,  0.4341],
-                    [-0.2533, -0.0042,  0.1068],
-                    [ 0.1076,  0.1111, -0.0362],
-                    [-0.3165, -0.2492, -0.2188]
+                    [ 0.3920,  0.4054,  0.4549],
+                    [-0.2634, -0.0196,  0.0653],
+                    [ 0.0568,  0.1687, -0.0755],
+                    [-0.3112, -0.2359, -0.2076]
                ]
-        self.latent_rgb_factors_bias = [ 0.1084, -0.0175, -0.0011]
-
        self.taesd_decoder_name = "taesdxl_decoder"

 class SDXL_Playground_2_5(LatentFormat):
@ -116,24 +112,23 @@ class SD3(LatentFormat):
        self.scale_factor = 1.5305
        self.shift_factor = 0.0609
        self.latent_rgb_factors = [
-            [-0.0922, -0.0175,  0.0749],
-            [ 0.0311,  0.0633,  0.0954],
-            [ 0.1994,  0.0927,  0.0458],
-            [ 0.0856,  0.0339,  0.0902],
-            [ 0.0587,  0.0272, -0.0496],
-            [-0.0006,  0.1104,  0.0309],
-            [ 0.0978,  0.0306,  0.0427],
-            [-0.0042,  0.1038,  0.1358],
-            [-0.0194,  0.0020,  0.0669],
-            [-0.0488,  0.0130, -0.0268],
-            [ 0.0922,  0.0988,  0.0951],
-            [-0.0278,  0.0524, -0.0542],
-            [ 0.0332,  0.0456,  0.0895],
-            [-0.0069, -0.0030, -0.0810],
-            [-0.0596, -0.0465, -0.0293],
-            [-0.1448, -0.1463, -0.1189]
+            [-0.0645,  0.0177,  0.1052],
+            [ 0.0028,  0.0312,  0.0650],
+            [ 0.1848,  0.0762,  0.0360],
+            [ 0.0944,  0.0360,  0.0889],
+            [ 0.0897,  0.0506, -0.0364],
+            [-0.0020,  0.1203,  0.0284],
+            [ 0.0855,  0.0118,  0.0283],
+            [-0.0539,  0.0658,  0.1047],
+            [-0.0057,  0.0116,  0.0700],
+            [-0.0412,  0.0281, -0.0039],
+            [ 0.1106,  0.1171,  0.1220],
+            [-0.0248,  0.0682, -0.0481],
+            [ 0.0815,  0.0846,  0.1207],
+            [-0.0120, -0.0055, -0.0867],
+            [-0.0749, -0.0634, -0.0456],
+            [-0.1418, -0.1457, -0.1259]
        ]
-        self.latent_rgb_factors_bias = [0.2394, 0.2135, 0.1925]
        self.taesd_decoder_name = "taesd3_decoder"

    def process_in(self, latent):
@ -144,325 +139,32 @@ class SD3(LatentFormat):

 class StableAudio1(LatentFormat):
    latent_channels = 64
-    latent_dimensions = 1

 class Flux(SD3):
-    latent_channels = 16
    def __init__(self):
        self.scale_factor = 0.3611
        self.shift_factor = 0.1159
        self.latent_rgb_factors =[
-            [-0.0346,  0.0244,  0.0681],
-            [ 0.0034,  0.0210,  0.0687],
-            [ 0.0275, -0.0668, -0.0433],
-            [-0.0174,  0.0160,  0.0617],
-            [ 0.0859,  0.0721,  0.0329],
-            [ 0.0004,  0.0383,  0.0115],
-            [ 0.0405,  0.0861,  0.0915],
-            [-0.0236, -0.0185, -0.0259],
-            [-0.0245,  0.0250,  0.1180],
-            [ 0.1008,  0.0755, -0.0421],
-            [-0.0515,  0.0201,  0.0011],
-            [ 0.0428, -0.0012, -0.0036],
-            [ 0.0817,  0.0765,  0.0749],
-            [-0.1264, -0.0522, -0.1103],
-            [-0.0280, -0.0881, -0.0499],
-            [-0.1262, -0.0982, -0.0778]
+            [-0.0404,  0.0159,  0.0609],
+            [ 0.0043,  0.0298,  0.0850],
+            [ 0.0328, -0.0749, -0.0503],
+            [-0.0245,  0.0085,  0.0549],
+            [ 0.0966,  0.0894,  0.0530],
+            [ 0.0035,  0.0399,  0.0123],
+            [ 0.0583,  0.1184,  0.1262],
+            [-0.0191, -0.0206, -0.0306],
+            [-0.0324,  0.0055,  0.1001],
+            [ 0.0955,  0.0659, -0.0545],
+            [-0.0504,  0.0231, -0.0013],
+            [ 0.0500, -0.0008, -0.0088],
+            [ 0.0982,  0.0941,  0.0976],
+            [-0.1233, -0.0280, -0.0897],
+            [-0.0005, -0.0530, -0.0020],
+            [-0.1273, -0.0932, -0.0680]
        ]
-        self.latent_rgb_factors_bias = [-0.0329, -0.0718, -0.0851]
-        self.taesd_decoder_name = "taef1_decoder"

    def process_in(self, latent):
        return (latent - self.shift_factor) * self.scale_factor

    def process_out(self, latent):
        return (latent / self.scale_factor) + self.shift_factor
-
-class Mochi(LatentFormat):
-    latent_channels = 12
-    latent_dimensions = 3
-
-    def __init__(self):
-        self.scale_factor = 1.0
-        self.latents_mean = torch.tensor([-0.06730895953510081, -0.038011381506090416, -0.07477820912866141,
-                                          -0.05565264470995561, 0.012767231469026969, -0.04703542746246419,
-                                          0.043896967884726704, -0.09346305707025976, -0.09918314763016893,
-                                          -0.008729793427399178, -0.011931556316503654, -0.0321993391887285]).view(1, self.latent_channels, 1, 1, 1)
-        self.latents_std = torch.tensor([0.9263795028493863, 0.9248894543193766, 0.9393059390890617,
-                                         0.959253732819592, 0.8244560132752793, 0.917259975397747,
-                                         0.9294154431013696, 1.3720942357788521, 0.881393668867029,
-                                         0.9168315692124348, 0.9185249279345552, 0.9274757570805041]).view(1, self.latent_channels, 1, 1, 1)
-
-        self.latent_rgb_factors =[
-            [-0.0069, -0.0045,  0.0018],
-            [ 0.0154, -0.0692, -0.0274],
-            [ 0.0333,  0.0019,  0.0206],
-            [-0.1390,  0.0628,  0.1678],
-            [-0.0725,  0.0134, -0.1898],
-            [ 0.0074, -0.0270, -0.0209],
-            [-0.0176, -0.0277, -0.0221],
-            [ 0.5294,  0.5204,  0.3852],
-            [-0.0326, -0.0446, -0.0143],
-            [-0.0659,  0.0153, -0.0153],
-            [ 0.0185, -0.0217,  0.0014],
-            [-0.0396, -0.0495, -0.0281]
-        ]
-        self.latent_rgb_factors_bias = [-0.0940, -0.1418, -0.1453]
-        self.taesd_decoder_name = None #TODO
-
-    def process_in(self, latent):
-        latents_mean = self.latents_mean.to(latent.device, latent.dtype)
-        latents_std = self.latents_std.to(latent.device, latent.dtype)
-        return (latent - latents_mean) * self.scale_factor / latents_std
-
-    def process_out(self, latent):
-        latents_mean = self.latents_mean.to(latent.device, latent.dtype)
-        latents_std = self.latents_std.to(latent.device, latent.dtype)
-        return latent * latents_std / self.scale_factor + latents_mean
-
-class LTXV(LatentFormat):
-    latent_channels = 128
-    latent_dimensions = 3
-
-    def __init__(self):
-        self.latent_rgb_factors = [
-            [ 1.1202e-02, -6.3815e-04, -1.0021e-02],
-            [ 8.6031e-02,  6.5813e-02,  9.5409e-04],
-            [-1.2576e-02, -7.5734e-03, -4.0528e-03],
-            [ 9.4063e-03, -2.1688e-03,  2.6093e-03],
-            [ 3.7636e-03,  1.2765e-02,  9.1548e-03],
-            [ 2.1024e-02, -5.2973e-03,  3.4373e-03],
-            [-8.8896e-03, -1.9703e-02, -1.8761e-02],
-            [-1.3160e-02, -1.0523e-02,  1.9709e-03],
-            [-1.5152e-03, -6.9891e-03, -7.5810e-03],
-            [-1.7247e-03,  4.6560e-04, -3.3839e-03],
-            [ 1.3617e-02,  4.7077e-03, -2.0045e-03],
-            [ 1.0256e-02,  7.7318e-03,  1.3948e-02],
-            [-1.6108e-02, -6.2151e-03,  1.1561e-03],
-            [ 7.3407e-03,  1.5628e-02,  4.4865e-04],
-            [ 9.5357e-04, -2.9518e-03, -1.4760e-02],
-            [ 1.9143e-02,  1.0868e-02,  1.2264e-02],
-            [ 4.4575e-03,  3.6682e-05, -6.8508e-03],
-            [-4.5681e-04,  3.2570e-03,  7.7929e-03],
-            [ 3.3902e-02,  3.3405e-02,  3.7454e-02],
-            [-2.3001e-02, -2.4877e-03, -3.1033e-03],
-            [ 5.0265e-02,  3.8841e-02,  3.3539e-02],
-            [-4.1018e-03, -1.1095e-03,  1.5859e-03],
-            [-1.2689e-01, -1.3107e-01, -2.1005e-01],
-            [ 2.6276e-02,  1.4189e-02, -3.5963e-03],
-            [-4.8679e-03,  8.8486e-03,  7.8029e-03],
-            [-1.6610e-03, -4.8597e-03, -5.2060e-03],
-            [-2.1010e-03,  2.3610e-03,  9.3796e-03],
-            [-2.2482e-02, -2.1305e-02, -1.5087e-02],
-            [-1.5753e-02, -1.0646e-02, -6.5083e-03],
-            [-4.6975e-03,  5.0288e-03, -6.7390e-03],
-            [ 1.1951e-02,  2.0712e-02,  1.6191e-02],
-            [-6.3704e-03, -8.4827e-03, -9.5483e-03],
-            [ 7.2610e-03, -9.9326e-03, -2.2978e-02],
-            [-9.1904e-04,  6.2882e-03,  9.5720e-03],
-            [-3.7178e-02, -3.7123e-02, -5.6713e-02],
-            [-1.3373e-01, -1.0720e-01, -5.3801e-02],
-            [-5.3702e-03,  8.1256e-03,  8.8397e-03],
-            [-1.5247e-01, -2.1437e-01, -2.1843e-01],
-            [ 3.1441e-02,  7.0335e-03, -9.7541e-03],
-            [ 2.1528e-03, -8.9817e-03, -2.1023e-02],
-            [ 3.8461e-03, -5.8957e-03, -1.5014e-02],
-            [-4.3470e-03, -1.2940e-02, -1.5972e-02],
-            [-5.4781e-03, -1.0842e-02, -3.0204e-03],
-            [-6.5347e-03,  3.0806e-03, -1.0163e-02],
-            [-5.0414e-03, -7.1503e-03, -8.9686e-04],
-            [-8.5851e-03, -2.4351e-03,  1.0674e-03],
-            [-9.0016e-03, -9.6493e-03,  1.5692e-03],
-            [ 5.0914e-03,  1.2099e-02,  1.9968e-02],
-            [ 1.3758e-02,  1.1669e-02,  8.1958e-03],
-            [-1.0518e-02, -1.1575e-02, -4.1307e-03],
-            [-2.8410e-02, -3.1266e-02, -2.2149e-02],
-            [ 2.9336e-03,  3.6511e-02,  1.8717e-02],
-            [-1.6703e-02, -1.6696e-02, -4.4529e-03],
-            [ 4.8818e-02,  4.0063e-02,  8.7410e-03],
-            [-1.5066e-02, -5.7328e-04,  2.9785e-03],
-            [-1.7613e-02, -8.1034e-03,  1.3086e-02],
-            [-9.2633e-03,  1.0803e-02, -6.3489e-03],
-            [ 3.0851e-03,  4.7750e-04,  1.2347e-02],
-            [-2.2785e-02, -2.3043e-02, -2.6005e-02],
-            [-2.4787e-02, -1.5389e-02, -2.2104e-02],
-            [-2.3572e-02,  1.0544e-03,  1.2361e-02],
-            [-7.8915e-03, -1.2271e-03, -6.0968e-03],
-            [-1.1478e-02, -1.2543e-03,  6.2679e-03],
-            [-5.4229e-02,  2.6644e-02,  6.3394e-03],
-            [ 4.4216e-03, -7.3338e-03, -1.0464e-02],
-            [-4.5013e-03,  1.6082e-03,  1.4420e-02],
-            [ 1.3673e-02,  8.8877e-03,  4.1253e-03],
-            [-1.0145e-02,  9.0072e-03,  1.5695e-02],
-            [-5.6234e-03,  1.1847e-03,  8.1261e-03],
-            [-3.7171e-03, -5.3538e-03,  1.2590e-03],
-            [ 2.9476e-02,  2.1424e-02,  3.0424e-02],
-            [-3.4925e-02, -2.4340e-02, -2.5316e-02],
-            [-3.4127e-02, -2.2406e-02, -1.0589e-02],
-            [-1.7342e-02, -1.3249e-02, -1.0719e-02],
-            [-2.1478e-03, -8.6051e-03, -2.9878e-03],
-            [ 1.2089e-03, -4.2391e-03, -6.8569e-03],
-            [ 9.0411e-04, -6.6886e-03, -6.7547e-05],
-            [ 1.6048e-02, -1.0057e-02, -2.8929e-02],
-            [ 1.2290e-03,  1.0163e-02,  1.8861e-02],
-            [ 1.7264e-02,  2.7257e-04,  1.3785e-02],
-            [-1.3482e-02, -3.6427e-03,  6.7481e-04],
-            [ 4.6782e-03, -5.2423e-03,  2.4467e-03],
-            [-5.9113e-03, -6.2244e-03, -1.8162e-03],
-            [ 1.5496e-02,  1.4582e-02,  1.9514e-03],
-            [ 7.4958e-03,  1.5886e-03, -8.2305e-03],
-            [ 1.9086e-02,  1.6360e-03, -3.9674e-03],
-            [-5.7021e-03, -2.7307e-03, -4.1066e-03],
-            [ 1.7450e-03,  1.4602e-02,  2.5794e-02],
-            [-8.2788e-04,  2.2902e-03,  4.5161e-03],
-            [ 1.1632e-02,  8.9193e-03, -7.2813e-03],
-            [ 7.5721e-03,  2.6784e-03,  1.1393e-02],
-            [ 5.1939e-03,  3.6903e-03,  1.4049e-02],
-            [-1.8383e-02, -2.2529e-02, -2.4477e-02],
-            [ 5.8842e-04, -5.7874e-03, -1.4770e-02],
-            [-1.6125e-02, -8.6101e-03, -1.4533e-02],
-            [ 2.0540e-02,  2.0729e-02,  6.4338e-03],
-            [ 3.3587e-03, -1.1226e-02, -1.6444e-02],
-            [-1.4742e-03, -1.0489e-02,  1.7097e-03],
-            [ 2.8130e-02,  2.3546e-02,  3.2791e-02],
-            [-1.8532e-02, -1.2842e-02, -8.7756e-03],
-            [-8.0533e-03, -1.0771e-02, -1.7536e-02],
-            [-3.9009e-03,  1.6150e-02,  3.3359e-02],
-            [-7.4554e-03, -1.4154e-02, -6.1910e-03],
-            [ 3.4734e-03, -1.1370e-02, -1.0581e-02],
-            [ 1.1476e-02,  3.9281e-03,  2.8231e-03],
-            [ 7.1639e-03, -1.4741e-03, -3.8066e-03],
-            [ 2.2250e-03, -8.7552e-03, -9.5719e-03],
-            [ 2.4146e-02,  2.1696e-02,  2.8056e-02],
-            [-5.4365e-03, -2.4291e-02, -1.7802e-02],
-            [ 7.4263e-03,  1.0510e-02,  1.2705e-02],
-            [ 6.2669e-03,  6.2658e-03,  1.9211e-02],
-            [ 1.6378e-02,  9.4933e-03,  6.6971e-03],
-            [ 1.7173e-02,  2.3601e-02,  2.3296e-02],
-            [-1.4568e-02, -9.8279e-03, -1.1556e-02],
-            [ 1.4431e-02,  1.4430e-02,  6.6362e-03],
-            [-6.8230e-03,  1.8863e-02,  1.4555e-02],
-            [ 6.1156e-03,  3.4700e-03, -2.6662e-03],
-            [-2.6983e-03, -5.9402e-03, -9.2276e-03],
-            [ 1.0235e-02,  7.4173e-03, -7.6243e-03],
-            [-1.3255e-02,  1.9322e-02, -9.2153e-04],
-            [ 2.4222e-03, -4.8039e-03, -1.5759e-02],
-            [ 2.6244e-02,  2.5951e-02,  2.0249e-02],
-            [ 1.5711e-02,  1.8498e-02,  2.7407e-03],
-            [-2.1714e-03,  4.7214e-03, -2.2443e-02],
-            [-7.4747e-03,  7.4166e-03,  1.4430e-02],
-            [-8.3906e-03, -7.9776e-03,  9.7927e-03],
-            [ 3.8321e-02,  9.6622e-03, -1.9268e-02],
-            [-1.4605e-02, -6.7032e-03,  3.9675e-03]
-        ]
-
-        self.latent_rgb_factors_bias = [-0.0571, -0.1657, -0.2512]
-
-class HunyuanVideo(LatentFormat):
-    latent_channels = 16
-    latent_dimensions = 3
-    scale_factor = 0.476986
-    latent_rgb_factors = [
-        [-0.0395, -0.0331,  0.0445],
-        [ 0.0696,  0.0795,  0.0518],
-        [ 0.0135, -0.0945, -0.0282],
-        [ 0.0108, -0.0250, -0.0765],
-        [-0.0209,  0.0032,  0.0224],
-        [-0.0804, -0.0254, -0.0639],
-        [-0.0991,  0.0271, -0.0669],
-        [-0.0646, -0.0422, -0.0400],
-        [-0.0696, -0.0595, -0.0894],
-        [-0.0799, -0.0208, -0.0375],
-        [ 0.1166,  0.1627,  0.0962],
-        [ 0.1165,  0.0432,  0.0407],
-        [-0.2315, -0.1920, -0.1355],
-        [-0.0270,  0.0401, -0.0821],
-        [-0.0616, -0.0997, -0.0727],
-        [ 0.0249, -0.0469, -0.1703]
-    ]
-
-    latent_rgb_factors_bias = [ 0.0259, -0.0192, -0.0761]
-
-class Cosmos1CV8x8x8(LatentFormat):
-    latent_channels = 16
-    latent_dimensions = 3
-
-    latent_rgb_factors = [
-        [ 0.1817,  0.2284,  0.2423],
-        [-0.0586, -0.0862, -0.3108],
-        [-0.4703, -0.4255, -0.3995],
-        [ 0.0803,  0.1963,  0.1001],
-        [-0.0820, -0.1050,  0.0400],
-        [ 0.2511,  0.3098,  0.2787],
-        [-0.1830, -0.2117, -0.0040],
-        [-0.0621, -0.2187, -0.0939],
-        [ 0.3619,  0.1082,  0.1455],
-        [ 0.3164,  0.3922,  0.2575],
-        [ 0.1152,  0.0231, -0.0462],
-        [-0.1434, -0.3609, -0.3665],
-        [ 0.0635,  0.1471,  0.1680],
-        [-0.3635, -0.1963, -0.3248],
-        [-0.1865,  0.0365,  0.2346],
-        [ 0.0447,  0.0994,  0.0881]
-    ]
-
-    latent_rgb_factors_bias = [-0.1223, -0.1889, -0.1976]
-
-class Wan21(LatentFormat):
-    latent_channels = 16
-    latent_dimensions = 3
-
-    latent_rgb_factors = [
-            [-0.1299, -0.1692,  0.2932],
-            [ 0.0671,  0.0406,  0.0442],
-            [ 0.3568,  0.2548,  0.1747],
-            [ 0.0372,  0.2344,  0.1420],
-            [ 0.0313,  0.0189, -0.0328],
-            [ 0.0296, -0.0956, -0.0665],
-            [-0.3477, -0.4059, -0.2925],
-            [ 0.0166,  0.1902,  0.1975],
-            [-0.0412,  0.0267, -0.1364],
-            [-0.1293,  0.0740,  0.1636],
-            [ 0.0680,  0.3019,  0.1128],
-            [ 0.0032,  0.0581,  0.0639],
-            [-0.1251,  0.0927,  0.1699],
-            [ 0.0060, -0.0633,  0.0005],
-            [ 0.3477,  0.2275,  0.2950],
-            [ 0.1984,  0.0913,  0.1861]
-        ]
-
-    latent_rgb_factors_bias = [-0.1835, -0.0868, -0.3360]
-
-    def __init__(self):
-        self.scale_factor = 1.0
-        self.latents_mean = torch.tensor([
-            -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
-            0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
-        ]).view(1, self.latent_channels, 1, 1, 1)
-        self.latents_std = torch.tensor([
-            2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
-            3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
-        ]).view(1, self.latent_channels, 1, 1, 1)
-
-
-        self.taesd_decoder_name = None #TODO
-
-    def process_in(self, latent):
-        latents_mean = self.latents_mean.to(latent.device, latent.dtype)
-        latents_std = self.latents_std.to(latent.device, latent.dtype)
-        return (latent - latents_mean) * self.scale_factor / latents_std
-
-    def process_out(self, latent):
-        latents_mean = self.latents_mean.to(latent.device, latent.dtype)
-        latents_std = self.latents_std.to(latent.device, latent.dtype)
-        return latent * latents_std / self.scale_factor + latents_mean
-
-class Hunyuan3Dv2(LatentFormat):
-    latent_channels = 64
-    latent_dimensions = 1
-    scale_factor = 0.9990943042622529
-
-class Hunyuan3Dv2mini(LatentFormat):
-    latent_channels = 64
-    latent_dimensions = 1
-    scale_factor = 1.0188137142395404
--- a/comfy/ldm/audio/autoencoder.py
+++ b/comfy/ldm/audio/autoencoder.py
@ -2,7 +2,7 @@

 import torch
 from torch import nn
-from typing import Literal
+from typing import Literal, Dict, Any
 import math
 import comfy.ops
 ops = comfy.ops.disable_weight_init
@ -97,7 +97,7 @@ def get_activation(activation: Literal["elu", "snake", "none"], antialias=False,
        raise ValueError(f"Unknown activation {activation}")

    if antialias:
-        act = Activation1d(act)  # noqa: F821 Activation1d is not defined
+        act = Activation1d(act)

    return act

--- a/comfy/ldm/audio/dit.py
+++ b/comfy/ldm/audio/dit.py
@ -158,6 +158,7 @@ class RotaryEmbedding(nn.Module):
    def forward(self, t):
        # device = self.inv_freq.device
        device = t.device
+        dtype = t.dtype

        # t = t.to(torch.float32)

@ -169,7 +170,7 @@ class RotaryEmbedding(nn.Module):
        if self.scale is None:
            return freqs, 1.

-        power = (torch.arange(seq_len, device = device) - (seq_len // 2)) / self.scale_base  # noqa: F821 seq_len is not defined
+        power = (torch.arange(seq_len, device = device) - (seq_len // 2)) / self.scale_base
        scale = comfy.ops.cast_to_input(self.scale, t) ** rearrange(power, 'n -> n 1')
        scale = torch.cat((scale, scale), dim = -1)

@ -228,9 +229,9 @@ class FeedForward(nn.Module):
            linear_in = GLU(dim, inner_dim, activation, dtype=dtype, device=device, operations=operations)
        else:
            linear_in = nn.Sequential(
-                rearrange('b n d -> b d n') if use_conv else nn.Identity(),
+                Rearrange('b n d -> b d n') if use_conv else nn.Identity(),
                operations.Linear(dim, inner_dim, bias = not no_bias, dtype=dtype, device=device) if not use_conv else operations.Conv1d(dim, inner_dim, conv_kernel_size, padding = (conv_kernel_size // 2), bias = not no_bias, dtype=dtype, device=device),
-                rearrange('b n d -> b d n') if use_conv else nn.Identity(),
+                Rearrange('b n d -> b d n') if use_conv else nn.Identity(),
                activation
            )

@ -245,9 +246,9 @@ class FeedForward(nn.Module):

        self.ff = nn.Sequential(
            linear_in,
-            rearrange('b d n -> b n d') if use_conv else nn.Identity(),
+            Rearrange('b d n -> b n d') if use_conv else nn.Identity(),
            linear_out,
-            rearrange('b n d -> b d n') if use_conv else nn.Identity(),
+            Rearrange('b n d -> b d n') if use_conv else nn.Identity(),
        )

    def forward(self, x):
@ -345,13 +346,18 @@ class Attention(nn.Module):

        # determine masking
        masks = []
+        final_attn_mask = None # The mask that will be applied to the attention matrix, taking all masks into account

        if input_mask is not None:
            input_mask = rearrange(input_mask, 'b j -> b 1 1 j')
            masks.append(~input_mask)

        # Other masks will be added here later
-        n = q.shape[-2]
+
+        if len(masks) > 0:
+            final_attn_mask = ~or_reduce(masks)
+
+        n, device = q.shape[-2], q.device

        causal = self.causal if causal is None else causal

@ -606,9 +612,7 @@ class ContinuousTransformer(nn.Module):
        return_info = False,
        **kwargs
    ):
-        patches_replace = kwargs.get("transformer_options", {}).get("patches_replace", {})
        batch, seq, device = *x.shape[:2], x.device
-        context = kwargs["context"]

        info = {
            "hidden_states": [],
@ -639,19 +643,9 @@ class ContinuousTransformer(nn.Module):
        if self.use_sinusoidal_emb or self.use_abs_pos_emb:
            x = x + self.pos_emb(x)

-        blocks_replace = patches_replace.get("dit", {})
        # Iterate over the transformer layers
-        for i, layer in enumerate(self.layers):
-            if ("double_block", i) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"] = layer(args["img"], rotary_pos_emb=args["pe"], global_cond=args["vec"], context=args["txt"])
-                    return out
-
-                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb}, {"original_block": block_wrap})
-                x = out["img"]
-            else:
-                x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context)
+        for layer in self.layers:
+            x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)
            # x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)

            if return_info:
@ -880,6 +874,7 @@ class AudioDiffusionTransformer(nn.Module):
        mask=None,
        return_info=False,
        control=None,
+        transformer_options={},
        **kwargs):
            return self._forward(
                x,
--- a/comfy/ldm/audio/embedders.py
+++ b/comfy/ldm/audio/embedders.py
@ -2,8 +2,8 @@

 import torch
 import torch.nn as nn
-from torch import Tensor
-from typing import List, Union
+from torch import Tensor, einsum
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, TypeVar, Union
 from einops import rearrange
 import math
 import comfy.ops
--- a/comfy/ldm/aura/mmdit.py
+++ b/comfy/ldm/aura/mmdit.py
@ -147,6 +147,7 @@ class DoubleAttention(nn.Module):

        bsz, seqlen1, _ = c.shape
        bsz, seqlen2, _ = x.shape
+        seqlen = seqlen1 + seqlen2

        cq, ck, cv = self.w1q(c), self.w1k(c), self.w1v(c)
        cq = cq.view(bsz, seqlen1, self.n_heads, self.head_dim)
@ -381,6 +382,7 @@ class MMDiT(nn.Module):
        pe_new = pe_as_2d.squeeze(0).permute(1, 2, 0).flatten(0, 1)
        self.positional_encoding.data = pe_new.unsqueeze(0).contiguous()
        self.h_max, self.w_max = target_dim
+        print("PE extended to", target_dim)

    def pe_selection_index_based_on_dim(self, h, w):
        h_p, w_p = h // self.patch_size, w // self.patch_size
@ -435,8 +437,7 @@ class MMDiT(nn.Module):
        pos_encoding = pos_encoding[:,from_h:from_h+h,from_w:from_w+w]
        return x + pos_encoding.reshape(1, -1, self.positional_encoding.shape[-1])

-    def forward(self, x, timestep, context, transformer_options={}, **kwargs):
-        patches_replace = transformer_options.get("patches_replace", {})
+    def forward(self, x, timestep, context, **kwargs):
        # patchify x, add PE
        b, c, h, w = x.shape

@ -457,35 +458,14 @@ class MMDiT(nn.Module):

        global_cond = self.t_embedder(t, x.dtype)  # B, D

-        blocks_replace = patches_replace.get("dit", {})
        if len(self.double_layers) > 0:
-            for i, layer in enumerate(self.double_layers):
-                if ("double_block", i) in blocks_replace:
-                    def block_wrap(args):
-                        out = {}
-                        out["txt"], out["img"] = layer(args["txt"],
-                                                       args["img"],
-                                                       args["vec"])
-                        return out
-                    out = blocks_replace[("double_block", i)]({"img": x, "txt": c, "vec": global_cond}, {"original_block": block_wrap})
-                    c = out["txt"]
-                    x = out["img"]
-                else:
+            for layer in self.double_layers:
                c, x = layer(c, x, global_cond, **kwargs)

        if len(self.single_layers) > 0:
            c_len = c.size(1)
            cx = torch.cat([c, x], dim=1)
-            for i, layer in enumerate(self.single_layers):
-                if ("single_block", i) in blocks_replace:
-                    def block_wrap(args):
-                        out = {}
-                        out["img"] = layer(args["img"], args["vec"])
-                        return out
-
-                    out = blocks_replace[("single_block", i)]({"img": cx, "vec": global_cond}, {"original_block": block_wrap})
-                    cx = out["img"]
-                else:
+            for layer in self.single_layers:
                cx = layer(cx, global_cond, **kwargs)

            x = cx[:, c_len:]
--- a/comfy/ldm/cascade/controlnet.py
+++ b/comfy/ldm/cascade/controlnet.py
@ -16,6 +16,7 @@
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """

+import torch
 import torchvision
 from torch import nn
 from .common import LayerNorm2d_op
--- a/comfy/ldm/cascade/stage_a.py
+++ b/comfy/ldm/cascade/stage_a.py
@ -19,10 +19,6 @@
 import torch
 from torch import nn
 from torch.autograd import Function
-import comfy.ops
-
-ops = comfy.ops.disable_weight_init
-

 class vector_quantize(Function):
    @staticmethod
@ -125,15 +121,15 @@ class ResBlock(nn.Module):
        self.norm1 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
        self.depthwise = nn.Sequential(
            nn.ReplicationPad2d(1),
-            ops.Conv2d(c, c, kernel_size=3, groups=c)
+            nn.Conv2d(c, c, kernel_size=3, groups=c)
        )

        # channelwise
        self.norm2 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
        self.channelwise = nn.Sequential(
-            ops.Linear(c, c_hidden),
+            nn.Linear(c, c_hidden),
            nn.GELU(),
-            ops.Linear(c_hidden, c),
+            nn.Linear(c_hidden, c),
        )

        self.gammas = nn.Parameter(torch.zeros(6), requires_grad=True)
@ -175,16 +171,16 @@ class StageA(nn.Module):
        # Encoder blocks
        self.in_block = nn.Sequential(
            nn.PixelUnshuffle(2),
-            ops.Conv2d(3 * 4, c_levels[0], kernel_size=1)
+            nn.Conv2d(3 * 4, c_levels[0], kernel_size=1)
        )
        down_blocks = []
        for i in range(levels):
            if i > 0:
-                down_blocks.append(ops.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
+                down_blocks.append(nn.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
            block = ResBlock(c_levels[i], c_levels[i] * 4)
            down_blocks.append(block)
        down_blocks.append(nn.Sequential(
-            ops.Conv2d(c_levels[-1], c_latent, kernel_size=1, bias=False),
+            nn.Conv2d(c_levels[-1], c_latent, kernel_size=1, bias=False),
            nn.BatchNorm2d(c_latent),  # then normalize them to have mean 0 and std 1
        ))
        self.down_blocks = nn.Sequential(*down_blocks)
@ -195,7 +191,7 @@ class StageA(nn.Module):

        # Decoder blocks
        up_blocks = [nn.Sequential(
-            ops.Conv2d(c_latent, c_levels[-1], kernel_size=1)
+            nn.Conv2d(c_latent, c_levels[-1], kernel_size=1)
        )]
        for i in range(levels):
            for j in range(bottleneck_blocks if i == 0 else 1):
@ -203,11 +199,11 @@ class StageA(nn.Module):
                up_blocks.append(block)
            if i < levels - 1:
                up_blocks.append(
-                    ops.ConvTranspose2d(c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2,
+                    nn.ConvTranspose2d(c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2,
                                       padding=1))
        self.up_blocks = nn.Sequential(*up_blocks)
        self.out_block = nn.Sequential(
-            ops.Conv2d(c_levels[0], 3 * 4, kernel_size=1),
+            nn.Conv2d(c_levels[0], 3 * 4, kernel_size=1),
            nn.PixelShuffle(2),
        )

@ -236,17 +232,17 @@ class Discriminator(nn.Module):
        super().__init__()
        d = max(depth - 3, 3)
        layers = [
-            nn.utils.spectral_norm(ops.Conv2d(c_in, c_hidden // (2 ** d), kernel_size=3, stride=2, padding=1)),
+            nn.utils.spectral_norm(nn.Conv2d(c_in, c_hidden // (2 ** d), kernel_size=3, stride=2, padding=1)),
            nn.LeakyReLU(0.2),
        ]
        for i in range(depth - 1):
            c_in = c_hidden // (2 ** max((d - i), 0))
            c_out = c_hidden // (2 ** max((d - 1 - i), 0))
-            layers.append(nn.utils.spectral_norm(ops.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
+            layers.append(nn.utils.spectral_norm(nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
            layers.append(nn.InstanceNorm2d(c_out))
            layers.append(nn.LeakyReLU(0.2))
        self.encoder = nn.Sequential(*layers)
-        self.shuffle = ops.Conv2d((c_hidden + c_cond) if c_cond > 0 else c_hidden, 1, kernel_size=1)
+        self.shuffle = nn.Conv2d((c_hidden + c_cond) if c_cond > 0 else c_hidden, 1, kernel_size=1)
        self.logits = nn.Sigmoid()

    def forward(self, x, cond=None):
--- a/comfy/ldm/cascade/stage_c_coder.py
+++ b/comfy/ldm/cascade/stage_c_coder.py
@ -19,9 +19,6 @@ import torch
 import torchvision
 from torch import nn

-import comfy.ops
-
-ops = comfy.ops.disable_weight_init

 # EfficientNet
 class EfficientNetEncoder(nn.Module):
@ -29,7 +26,7 @@ class EfficientNetEncoder(nn.Module):
        super().__init__()
        self.backbone = torchvision.models.efficientnet_v2_s().features.eval()
        self.mapper = nn.Sequential(
-            ops.Conv2d(1280, c_latent, kernel_size=1, bias=False),
+            nn.Conv2d(1280, c_latent, kernel_size=1, bias=False),
            nn.BatchNorm2d(c_latent, affine=False),  # then normalize them to have mean 0 and std 1
        )
        self.mean = nn.Parameter(torch.tensor([0.485, 0.456, 0.406]))
@ -37,7 +34,7 @@ class EfficientNetEncoder(nn.Module):

    def forward(self, x):
        x = x * 0.5 + 0.5
-        x = (x - self.mean.view([3,1,1]).to(device=x.device, dtype=x.dtype)) / self.std.view([3,1,1]).to(device=x.device, dtype=x.dtype)
+        x = (x - self.mean.view([3,1,1])) / self.std.view([3,1,1])
        o = self.mapper(self.backbone(x))
        return o

@ -47,39 +44,39 @@ class Previewer(nn.Module):
    def __init__(self, c_in=16, c_hidden=512, c_out=3):
        super().__init__()
        self.blocks = nn.Sequential(
-            ops.Conv2d(c_in, c_hidden, kernel_size=1),  # 16 channels to 512 channels
+            nn.Conv2d(c_in, c_hidden, kernel_size=1),  # 16 channels to 512 channels
            nn.GELU(),
            nn.BatchNorm2d(c_hidden),

-            ops.Conv2d(c_hidden, c_hidden, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden, c_hidden, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden),

-            ops.ConvTranspose2d(c_hidden, c_hidden // 2, kernel_size=2, stride=2),  # 16 -> 32
+            nn.ConvTranspose2d(c_hidden, c_hidden // 2, kernel_size=2, stride=2),  # 16 -> 32
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 2),

-            ops.Conv2d(c_hidden // 2, c_hidden // 2, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 2, c_hidden // 2, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 2),

-            ops.ConvTranspose2d(c_hidden // 2, c_hidden // 4, kernel_size=2, stride=2),  # 32 -> 64
+            nn.ConvTranspose2d(c_hidden // 2, c_hidden // 4, kernel_size=2, stride=2),  # 32 -> 64
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            ops.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            ops.ConvTranspose2d(c_hidden // 4, c_hidden // 4, kernel_size=2, stride=2),  # 64 -> 128
+            nn.ConvTranspose2d(c_hidden // 4, c_hidden // 4, kernel_size=2, stride=2),  # 64 -> 128
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            ops.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
+            nn.Conv2d(c_hidden // 4, c_hidden // 4, kernel_size=3, padding=1),
            nn.GELU(),
            nn.BatchNorm2d(c_hidden // 4),

-            ops.Conv2d(c_hidden // 4, c_out, kernel_size=1),
+            nn.Conv2d(c_hidden // 4, c_out, kernel_size=1),
        )

    def forward(self, x):
--- a/comfy/ldm/common_dit.py
+++ b/comfy/ldm/common_dit.py
@ -1,16 +1,8 @@
 import torch
-import comfy.rmsnorm
-

 def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):
-    if padding_mode == "circular" and (torch.jit.is_tracing() or torch.jit.is_scripting()):
+    if padding_mode == "circular" and torch.jit.is_tracing() or torch.jit.is_scripting():
        padding_mode = "reflect"
-
-    pad = ()
-    for i in range(img.ndim - 2):
-        pad = (0, (patch_size[i] - img.shape[i + 2] % patch_size[i]) % patch_size[i]) + pad
-
-    return torch.nn.functional.pad(img, pad, mode=padding_mode)
-
-
-rms_norm = comfy.rmsnorm.rms_norm
+    pad_h = (patch_size[0] - img.shape[-2] % patch_size[0]) % patch_size[0]
+    pad_w = (patch_size[1] - img.shape[-1] % patch_size[1]) % patch_size[1]
+    return torch.nn.functional.pad(img, (0, pad_w, 0, pad_h), mode=padding_mode)
--- a/comfy/ldm/cosmos/blocks.py
+++ b/comfy/ldm/cosmos/blocks.py
@ -1,808 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-from typing import Optional
-import logging
-
-import numpy as np
-import torch
-from einops import rearrange, repeat
-from einops.layers.torch import Rearrange
-from torch import nn
-
-from comfy.ldm.modules.diffusionmodules.mmdit import RMSNorm
-from comfy.ldm.modules.attention import optimized_attention
-
-
-def apply_rotary_pos_emb(
-    t: torch.Tensor,
-    freqs: torch.Tensor,
-) -> torch.Tensor:
-    t_ = t.reshape(*t.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2).float()
-    t_out = freqs[..., 0] * t_[..., 0] + freqs[..., 1] * t_[..., 1]
-    t_out = t_out.movedim(-1, -2).reshape(*t.shape).type_as(t)
-    return t_out
-
-
-def get_normalization(name: str, channels: int, weight_args={}):
-    if name == "I":
-        return nn.Identity()
-    elif name == "R":
-        return RMSNorm(channels, elementwise_affine=True, eps=1e-6, **weight_args)
-    else:
-        raise ValueError(f"Normalization {name} not found")
-
-
-class BaseAttentionOp(nn.Module):
-    def __init__(self):
-        super().__init__()
-
-
-class Attention(nn.Module):
-    """
-    Generalized attention impl.
-
-    Allowing for both self-attention and cross-attention configurations depending on whether a `context_dim` is provided.
-    If `context_dim` is None, self-attention is assumed.
-
-    Parameters:
-        query_dim (int): Dimension of each query vector.
-        context_dim (int, optional): Dimension of each context vector. If None, self-attention is assumed.
-        heads (int, optional): Number of attention heads. Defaults to 8.
-        dim_head (int, optional): Dimension of each head. Defaults to 64.
-        dropout (float, optional): Dropout rate applied to the output of the attention block. Defaults to 0.0.
-        attn_op (BaseAttentionOp, optional): Custom attention operation to be used instead of the default.
-        qkv_bias (bool, optional): If True, adds a learnable bias to query, key, and value projections. Defaults to False.
-        out_bias (bool, optional): If True, adds a learnable bias to the output projection. Defaults to False.
-        qkv_norm (str, optional): A string representing normalization strategies for query, key, and value projections.
-                                  Defaults to "SSI".
-        qkv_norm_mode (str, optional): A string representing normalization mode for query, key, and value projections.
-                                        Defaults to 'per_head'. Only support 'per_head'.
-
-    Examples:
-        >>> attn = Attention(query_dim=128, context_dim=256, heads=4, dim_head=32, dropout=0.1)
-        >>> query = torch.randn(10, 128)  # Batch size of 10
-        >>> context = torch.randn(10, 256)  # Batch size of 10
-        >>> output = attn(query, context)  # Perform the attention operation
-
-    Note:
-        https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
-    """
-
-    def __init__(
-        self,
-        query_dim: int,
-        context_dim=None,
-        heads=8,
-        dim_head=64,
-        dropout=0.0,
-        attn_op: Optional[BaseAttentionOp] = None,
-        qkv_bias: bool = False,
-        out_bias: bool = False,
-        qkv_norm: str = "SSI",
-        qkv_norm_mode: str = "per_head",
-        backend: str = "transformer_engine",
-        qkv_format: str = "bshd",
-        weight_args={},
-        operations=None,
-    ) -> None:
-        super().__init__()
-
-        self.is_selfattn = context_dim is None  # self attention
-
-        inner_dim = dim_head * heads
-        context_dim = query_dim if context_dim is None else context_dim
-
-        self.heads = heads
-        self.dim_head = dim_head
-        self.qkv_norm_mode = qkv_norm_mode
-        self.qkv_format = qkv_format
-
-        if self.qkv_norm_mode == "per_head":
-            norm_dim = dim_head
-        else:
-            raise ValueError(f"Normalization mode {self.qkv_norm_mode} not found, only support 'per_head'")
-
-        self.backend = backend
-
-        self.to_q = nn.Sequential(
-            operations.Linear(query_dim, inner_dim, bias=qkv_bias, **weight_args),
-            get_normalization(qkv_norm[0], norm_dim),
-        )
-        self.to_k = nn.Sequential(
-            operations.Linear(context_dim, inner_dim, bias=qkv_bias, **weight_args),
-            get_normalization(qkv_norm[1], norm_dim),
-        )
-        self.to_v = nn.Sequential(
-            operations.Linear(context_dim, inner_dim, bias=qkv_bias, **weight_args),
-            get_normalization(qkv_norm[2], norm_dim),
-        )
-
-        self.to_out = nn.Sequential(
-            operations.Linear(inner_dim, query_dim, bias=out_bias, **weight_args),
-            nn.Dropout(dropout),
-        )
-
-    def cal_qkv(
-        self, x, context=None, mask=None, rope_emb=None, **kwargs
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        del kwargs
-
-
-        """
-        self.to_q, self.to_k, self.to_v are nn.Sequential with projection + normalization layers.
-        Before 07/24/2024, these modules normalize across all heads.
-        After 07/24/2024, to support tensor parallelism and follow the common practice in the community,
-        we support to normalize per head.
-        To keep the checkpoint copatibility with the previous code,
-        we keep the nn.Sequential but call the projection and the normalization layers separately.
-        We use a flag `self.qkv_norm_mode` to control the normalization behavior.
-        The default value of `self.qkv_norm_mode` is "per_head", which means we normalize per head.
-        """
-        if self.qkv_norm_mode == "per_head":
-            q = self.to_q[0](x)
-            context = x if context is None else context
-            k = self.to_k[0](context)
-            v = self.to_v[0](context)
-            q, k, v = map(
-                lambda t: rearrange(t, "s b (n c) -> b n s c", n=self.heads, c=self.dim_head),
-                (q, k, v),
-            )
-        else:
-            raise ValueError(f"Normalization mode {self.qkv_norm_mode} not found, only support 'per_head'")
-
-        q = self.to_q[1](q)
-        k = self.to_k[1](k)
-        v = self.to_v[1](v)
-        if self.is_selfattn and rope_emb is not None:  # only apply to self-attention!
-            # apply_rotary_pos_emb inlined
-            q_shape = q.shape
-            q = q.reshape(*q.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2)
-            q = rope_emb[..., 0] * q[..., 0] + rope_emb[..., 1] * q[..., 1]
-            q = q.movedim(-1, -2).reshape(*q_shape).to(x.dtype)
-
-            # apply_rotary_pos_emb inlined
-            k_shape = k.shape
-            k = k.reshape(*k.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2)
-            k = rope_emb[..., 0] * k[..., 0] + rope_emb[..., 1] * k[..., 1]
-            k = k.movedim(-1, -2).reshape(*k_shape).to(x.dtype)
-        return q, k, v
-
-    def forward(
-        self,
-        x,
-        context=None,
-        mask=None,
-        rope_emb=None,
-        **kwargs,
-    ):
-        """
-        Args:
-            x (Tensor): The query tensor of shape [B, Mq, K]
-            context (Optional[Tensor]): The key tensor of shape [B, Mk, K] or use x as context [self attention] if None
-        """
-        q, k, v = self.cal_qkv(x, context, mask, rope_emb=rope_emb, **kwargs)
-        out = optimized_attention(q, k, v, self.heads, skip_reshape=True, mask=mask, skip_output_reshape=True)
-        del q, k, v
-        out = rearrange(out, " b n s c -> s b (n c)")
-        return self.to_out(out)
-
-
-class FeedForward(nn.Module):
-    """
-    Transformer FFN with optional gating
-
-    Parameters:
-        d_model (int): Dimensionality of input features.
-        d_ff (int): Dimensionality of the hidden layer.
-        dropout (float, optional): Dropout rate applied after the activation function. Defaults to 0.1.
-        activation (callable, optional): The activation function applied after the first linear layer.
-                                         Defaults to nn.ReLU().
-        is_gated (bool, optional): If set to True, incorporates gating mechanism to the feed-forward layer.
-                                   Defaults to False.
-        bias (bool, optional): If set to True, adds a bias to the linear layers. Defaults to True.
-
-    Example:
-        >>> ff = FeedForward(d_model=512, d_ff=2048)
-        >>> x = torch.randn(64, 10, 512)  # Example input tensor
-        >>> output = ff(x)
-        >>> print(output.shape)  # Expected shape: (64, 10, 512)
-    """
-
-    def __init__(
-        self,
-        d_model: int,
-        d_ff: int,
-        dropout: float = 0.1,
-        activation=nn.ReLU(),
-        is_gated: bool = False,
-        bias: bool = False,
-        weight_args={},
-        operations=None,
-    ) -> None:
-        super().__init__()
-
-        self.layer1 = operations.Linear(d_model, d_ff, bias=bias, **weight_args)
-        self.layer2 = operations.Linear(d_ff, d_model, bias=bias, **weight_args)
-
-        self.dropout = nn.Dropout(dropout)
-        self.activation = activation
-        self.is_gated = is_gated
-        if is_gated:
-            self.linear_gate = operations.Linear(d_model, d_ff, bias=False, **weight_args)
-
-    def forward(self, x: torch.Tensor):
-        g = self.activation(self.layer1(x))
-        if self.is_gated:
-            x = g * self.linear_gate(x)
-        else:
-            x = g
-        assert self.dropout.p == 0.0, "we skip dropout"
-        return self.layer2(x)
-
-
-class GPT2FeedForward(FeedForward):
-    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1, bias: bool = False, weight_args={}, operations=None):
-        super().__init__(
-            d_model=d_model,
-            d_ff=d_ff,
-            dropout=dropout,
-            activation=nn.GELU(),
-            is_gated=False,
-            bias=bias,
-            weight_args=weight_args,
-            operations=operations,
-        )
-
-    def forward(self, x: torch.Tensor):
-        assert self.dropout.p == 0.0, "we skip dropout"
-
-        x = self.layer1(x)
-        x = self.activation(x)
-        x = self.layer2(x)
-
-        return x
-
-
-def modulate(x, shift, scale):
-    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
-
-
-class Timesteps(nn.Module):
-    def __init__(self, num_channels):
-        super().__init__()
-        self.num_channels = num_channels
-
-    def forward(self, timesteps):
-        half_dim = self.num_channels // 2
-        exponent = -math.log(10000) * torch.arange(half_dim, dtype=torch.float32, device=timesteps.device)
-        exponent = exponent / (half_dim - 0.0)
-
-        emb = torch.exp(exponent)
-        emb = timesteps[:, None].float() * emb[None, :]
-
-        sin_emb = torch.sin(emb)
-        cos_emb = torch.cos(emb)
-        emb = torch.cat([cos_emb, sin_emb], dim=-1)
-
-        return emb
-
-
-class TimestepEmbedding(nn.Module):
-    def __init__(self, in_features: int, out_features: int, use_adaln_lora: bool = False, weight_args={}, operations=None):
-        super().__init__()
-        logging.debug(
-            f"Using AdaLN LoRA Flag:  {use_adaln_lora}. We enable bias if no AdaLN LoRA for backward compatibility."
-        )
-        self.linear_1 = operations.Linear(in_features, out_features, bias=not use_adaln_lora, **weight_args)
-        self.activation = nn.SiLU()
-        self.use_adaln_lora = use_adaln_lora
-        if use_adaln_lora:
-            self.linear_2 = operations.Linear(out_features, 3 * out_features, bias=False, **weight_args)
-        else:
-            self.linear_2 = operations.Linear(out_features, out_features, bias=True, **weight_args)
-
-    def forward(self, sample: torch.Tensor) -> torch.Tensor:
-        emb = self.linear_1(sample)
-        emb = self.activation(emb)
-        emb = self.linear_2(emb)
-
-        if self.use_adaln_lora:
-            adaln_lora_B_3D = emb
-            emb_B_D = sample
-        else:
-            emb_B_D = emb
-            adaln_lora_B_3D = None
-
-        return emb_B_D, adaln_lora_B_3D
-
-
-class FourierFeatures(nn.Module):
-    """
-    Implements a layer that generates Fourier features from input tensors, based on randomly sampled
-    frequencies and phases. This can help in learning high-frequency functions in low-dimensional problems.
-
-    [B] -> [B, D]
-
-    Parameters:
-        num_channels (int): The number of Fourier features to generate.
-        bandwidth (float, optional): The scaling factor for the frequency of the Fourier features. Defaults to 1.
-        normalize (bool, optional): If set to True, the outputs are scaled by sqrt(2), usually to normalize
-                                    the variance of the features. Defaults to False.
-
-    Example:
-        >>> layer = FourierFeatures(num_channels=256, bandwidth=0.5, normalize=True)
-        >>> x = torch.randn(10, 256)  # Example input tensor
-        >>> output = layer(x)
-        >>> print(output.shape)  # Expected shape: (10, 256)
-    """
-
-    def __init__(self, num_channels, bandwidth=1, normalize=False):
-        super().__init__()
-        self.register_buffer("freqs", 2 * np.pi * bandwidth * torch.randn(num_channels), persistent=True)
-        self.register_buffer("phases", 2 * np.pi * torch.rand(num_channels), persistent=True)
-        self.gain = np.sqrt(2) if normalize else 1
-
-    def forward(self, x, gain: float = 1.0):
-        """
-        Apply the Fourier feature transformation to the input tensor.
-
-        Args:
-            x (torch.Tensor): The input tensor.
-            gain (float, optional): An additional gain factor applied during the forward pass. Defaults to 1.
-
-        Returns:
-            torch.Tensor: The transformed tensor, with Fourier features applied.
-        """
-        in_dtype = x.dtype
-        x = x.to(torch.float32).ger(self.freqs.to(torch.float32)).add(self.phases.to(torch.float32))
-        x = x.cos().mul(self.gain * gain).to(in_dtype)
-        return x
-
-
-class PatchEmbed(nn.Module):
-    """
-    PatchEmbed is a module for embedding patches from an input tensor by applying either 3D or 2D convolutional layers,
-    depending on the . This module can process inputs with temporal (video) and spatial (image) dimensions,
-    making it suitable for video and image processing tasks. It supports dividing the input into patches
-    and embedding each patch into a vector of size `out_channels`.
-
-    Parameters:
-    - spatial_patch_size (int): The size of each spatial patch.
-    - temporal_patch_size (int): The size of each temporal patch.
-    - in_channels (int): Number of input channels. Default: 3.
-    - out_channels (int): The dimension of the embedding vector for each patch. Default: 768.
-    - bias (bool): If True, adds a learnable bias to the output of the convolutional layers. Default: True.
-    """
-
-    def __init__(
-        self,
-        spatial_patch_size,
-        temporal_patch_size,
-        in_channels=3,
-        out_channels=768,
-        bias=True,
-        weight_args={},
-        operations=None,
-    ):
-        super().__init__()
-        self.spatial_patch_size = spatial_patch_size
-        self.temporal_patch_size = temporal_patch_size
-
-        self.proj = nn.Sequential(
-            Rearrange(
-                "b c (t r) (h m) (w n) -> b t h w (c r m n)",
-                r=temporal_patch_size,
-                m=spatial_patch_size,
-                n=spatial_patch_size,
-            ),
-            operations.Linear(
-                in_channels * spatial_patch_size * spatial_patch_size * temporal_patch_size, out_channels, bias=bias, **weight_args
-            ),
-        )
-        self.out = nn.Identity()
-
-    def forward(self, x):
-        """
-        Forward pass of the PatchEmbed module.
-
-        Parameters:
-        - x (torch.Tensor): The input tensor of shape (B, C, T, H, W) where
-            B is the batch size,
-            C is the number of channels,
-            T is the temporal dimension,
-            H is the height, and
-            W is the width of the input.
-
-        Returns:
-        - torch.Tensor: The embedded patches as a tensor, with shape b t h w c.
-        """
-        assert x.dim() == 5
-        _, _, T, H, W = x.shape
-        assert H % self.spatial_patch_size == 0 and W % self.spatial_patch_size == 0
-        assert T % self.temporal_patch_size == 0
-        x = self.proj(x)
-        return self.out(x)
-
-
-class FinalLayer(nn.Module):
-    """
-    The final layer of video DiT.
-    """
-
-    def __init__(
-        self,
-        hidden_size,
-        spatial_patch_size,
-        temporal_patch_size,
-        out_channels,
-        use_adaln_lora: bool = False,
-        adaln_lora_dim: int = 256,
-        weight_args={},
-        operations=None,
-    ):
-        super().__init__()
-        self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, **weight_args)
-        self.linear = operations.Linear(
-            hidden_size, spatial_patch_size * spatial_patch_size * temporal_patch_size * out_channels, bias=False, **weight_args
-        )
-        self.hidden_size = hidden_size
-        self.n_adaln_chunks = 2
-        self.use_adaln_lora = use_adaln_lora
-        if use_adaln_lora:
-            self.adaLN_modulation = nn.Sequential(
-                nn.SiLU(),
-                operations.Linear(hidden_size, adaln_lora_dim, bias=False, **weight_args),
-                operations.Linear(adaln_lora_dim, self.n_adaln_chunks * hidden_size, bias=False, **weight_args),
-            )
-        else:
-            self.adaLN_modulation = nn.Sequential(
-                nn.SiLU(), operations.Linear(hidden_size, self.n_adaln_chunks * hidden_size, bias=False, **weight_args)
-            )
-
-    def forward(
-        self,
-        x_BT_HW_D,
-        emb_B_D,
-        adaln_lora_B_3D: Optional[torch.Tensor] = None,
-    ):
-        if self.use_adaln_lora:
-            assert adaln_lora_B_3D is not None
-            shift_B_D, scale_B_D = (self.adaLN_modulation(emb_B_D) + adaln_lora_B_3D[:, : 2 * self.hidden_size]).chunk(
-                2, dim=1
-            )
-        else:
-            shift_B_D, scale_B_D = self.adaLN_modulation(emb_B_D).chunk(2, dim=1)
-
-        B = emb_B_D.shape[0]
-        T = x_BT_HW_D.shape[0] // B
-        shift_BT_D, scale_BT_D = repeat(shift_B_D, "b d -> (b t) d", t=T), repeat(scale_B_D, "b d -> (b t) d", t=T)
-        x_BT_HW_D = modulate(self.norm_final(x_BT_HW_D), shift_BT_D, scale_BT_D)
-
-        x_BT_HW_D = self.linear(x_BT_HW_D)
-        return x_BT_HW_D
-
-
-class VideoAttn(nn.Module):
-    """
-    Implements video attention with optional cross-attention capabilities.
-
-    This module processes video features while maintaining their spatio-temporal structure. It can perform
-    self-attention within the video features or cross-attention with external context features.
-
-    Parameters:
-        x_dim (int): Dimension of input feature vectors
-        context_dim (Optional[int]): Dimension of context features for cross-attention. None for self-attention
-        num_heads (int): Number of attention heads
-        bias (bool): Whether to include bias in attention projections. Default: False
-        qkv_norm_mode (str): Normalization mode for query/key/value projections. Must be "per_head". Default: "per_head"
-        x_format (str): Format of input tensor. Must be "BTHWD". Default: "BTHWD"
-
-    Input shape:
-        - x: (T, H, W, B, D) video features
-        - context (optional): (M, B, D) context features for cross-attention
-        where:
-            T: temporal dimension
-            H: height
-            W: width
-            B: batch size
-            D: feature dimension
-            M: context sequence length
-    """
-
-    def __init__(
-        self,
-        x_dim: int,
-        context_dim: Optional[int],
-        num_heads: int,
-        bias: bool = False,
-        qkv_norm_mode: str = "per_head",
-        x_format: str = "BTHWD",
-        weight_args={},
-        operations=None,
-    ) -> None:
-        super().__init__()
-        self.x_format = x_format
-
-        self.attn = Attention(
-            x_dim,
-            context_dim,
-            num_heads,
-            x_dim // num_heads,
-            qkv_bias=bias,
-            qkv_norm="RRI",
-            out_bias=bias,
-            qkv_norm_mode=qkv_norm_mode,
-            qkv_format="sbhd",
-            weight_args=weight_args,
-            operations=operations,
-        )
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        context: Optional[torch.Tensor] = None,
-        crossattn_mask: Optional[torch.Tensor] = None,
-        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """
-        Forward pass for video attention.
-
-        Args:
-            x (Tensor): Input tensor of shape (B, T, H, W, D) or (T, H, W, B, D) representing batches of video data.
-            context (Tensor): Context tensor of shape (B, M, D) or (M, B, D),
-            where M is the sequence length of the context.
-            crossattn_mask (Optional[Tensor]): An optional mask for cross-attention mechanisms.
-            rope_emb_L_1_1_D (Optional[Tensor]):
-            Rotary positional embedding tensor of shape (L, 1, 1, D). L == THW for current video training.
-
-        Returns:
-            Tensor: The output tensor with applied attention, maintaining the input shape.
-        """
-
-        x_T_H_W_B_D = x
-        context_M_B_D = context
-        T, H, W, B, D = x_T_H_W_B_D.shape
-        x_THW_B_D = rearrange(x_T_H_W_B_D, "t h w b d -> (t h w) b d")
-        x_THW_B_D = self.attn(
-            x_THW_B_D,
-            context_M_B_D,
-            crossattn_mask,
-            rope_emb=rope_emb_L_1_1_D,
-        )
-        x_T_H_W_B_D = rearrange(x_THW_B_D, "(t h w) b d -> t h w b d", h=H, w=W)
-        return x_T_H_W_B_D
-
-
-def adaln_norm_state(norm_state, x, scale, shift):
-    normalized = norm_state(x)
-    return normalized * (1 + scale) + shift
-
-
-class DITBuildingBlock(nn.Module):
-    """
-    A building block for the DiT (Diffusion Transformer) architecture that supports different types of
-    attention and MLP operations with adaptive layer normalization.
-
-    Parameters:
-        block_type (str): Type of block - one of:
-            - "cross_attn"/"ca": Cross-attention
-            - "full_attn"/"fa": Full self-attention
-            - "mlp"/"ff": MLP/feedforward block
-        x_dim (int): Dimension of input features
-        context_dim (Optional[int]): Dimension of context features for cross-attention
-        num_heads (int): Number of attention heads
-        mlp_ratio (float): MLP hidden dimension multiplier. Default: 4.0
-        bias (bool): Whether to use bias in layers. Default: False
-        mlp_dropout (float): Dropout rate for MLP. Default: 0.0
-        qkv_norm_mode (str): QKV normalization mode. Default: "per_head"
-        x_format (str): Input tensor format. Default: "BTHWD"
-        use_adaln_lora (bool): Whether to use AdaLN-LoRA. Default: False
-        adaln_lora_dim (int): Dimension for AdaLN-LoRA. Default: 256
-    """
-
-    def __init__(
-        self,
-        block_type: str,
-        x_dim: int,
-        context_dim: Optional[int],
-        num_heads: int,
-        mlp_ratio: float = 4.0,
-        bias: bool = False,
-        mlp_dropout: float = 0.0,
-        qkv_norm_mode: str = "per_head",
-        x_format: str = "BTHWD",
-        use_adaln_lora: bool = False,
-        adaln_lora_dim: int = 256,
-        weight_args={},
-        operations=None
-    ) -> None:
-        block_type = block_type.lower()
-
-        super().__init__()
-        self.x_format = x_format
-        if block_type in ["cross_attn", "ca"]:
-            self.block = VideoAttn(
-                x_dim,
-                context_dim,
-                num_heads,
-                bias=bias,
-                qkv_norm_mode=qkv_norm_mode,
-                x_format=self.x_format,
-                weight_args=weight_args,
-                operations=operations,
-            )
-        elif block_type in ["full_attn", "fa"]:
-            self.block = VideoAttn(
-                x_dim, None, num_heads, bias=bias, qkv_norm_mode=qkv_norm_mode, x_format=self.x_format, weight_args=weight_args, operations=operations
-            )
-        elif block_type in ["mlp", "ff"]:
-            self.block = GPT2FeedForward(x_dim, int(x_dim * mlp_ratio), dropout=mlp_dropout, bias=bias, weight_args=weight_args, operations=operations)
-        else:
-            raise ValueError(f"Unknown block type: {block_type}")
-
-        self.block_type = block_type
-        self.use_adaln_lora = use_adaln_lora
-
-        self.norm_state = nn.LayerNorm(x_dim, elementwise_affine=False, eps=1e-6)
-        self.n_adaln_chunks = 3
-        if use_adaln_lora:
-            self.adaLN_modulation = nn.Sequential(
-                nn.SiLU(),
-                operations.Linear(x_dim, adaln_lora_dim, bias=False, **weight_args),
-                operations.Linear(adaln_lora_dim, self.n_adaln_chunks * x_dim, bias=False, **weight_args),
-            )
-        else:
-            self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(x_dim, self.n_adaln_chunks * x_dim, bias=False, **weight_args))
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        emb_B_D: torch.Tensor,
-        crossattn_emb: torch.Tensor,
-        crossattn_mask: Optional[torch.Tensor] = None,
-        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
-        adaln_lora_B_3D: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """
-        Forward pass for dynamically configured blocks with adaptive normalization.
-
-        Args:
-            x (Tensor): Input tensor of shape (B, T, H, W, D) or (T, H, W, B, D).
-            emb_B_D (Tensor): Embedding tensor for adaptive layer normalization modulation.
-            crossattn_emb (Tensor): Tensor for cross-attention blocks.
-            crossattn_mask (Optional[Tensor]): Optional mask for cross-attention.
-            rope_emb_L_1_1_D (Optional[Tensor]):
-            Rotary positional embedding tensor of shape (L, 1, 1, D). L == THW for current video training.
-
-        Returns:
-            Tensor: The output tensor after processing through the configured block and adaptive normalization.
-        """
-        if self.use_adaln_lora:
-            shift_B_D, scale_B_D, gate_B_D = (self.adaLN_modulation(emb_B_D) + adaln_lora_B_3D).chunk(
-                self.n_adaln_chunks, dim=1
-            )
-        else:
-            shift_B_D, scale_B_D, gate_B_D = self.adaLN_modulation(emb_B_D).chunk(self.n_adaln_chunks, dim=1)
-
-        shift_1_1_1_B_D, scale_1_1_1_B_D, gate_1_1_1_B_D = (
-            shift_B_D.unsqueeze(0).unsqueeze(0).unsqueeze(0),
-            scale_B_D.unsqueeze(0).unsqueeze(0).unsqueeze(0),
-            gate_B_D.unsqueeze(0).unsqueeze(0).unsqueeze(0),
-        )
-
-        if self.block_type in ["mlp", "ff"]:
-            x = x + gate_1_1_1_B_D * self.block(
-                adaln_norm_state(self.norm_state, x, scale_1_1_1_B_D, shift_1_1_1_B_D),
-            )
-        elif self.block_type in ["full_attn", "fa"]:
-            x = x + gate_1_1_1_B_D * self.block(
-                adaln_norm_state(self.norm_state, x, scale_1_1_1_B_D, shift_1_1_1_B_D),
-                context=None,
-                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
-            )
-        elif self.block_type in ["cross_attn", "ca"]:
-            x = x + gate_1_1_1_B_D * self.block(
-                adaln_norm_state(self.norm_state, x, scale_1_1_1_B_D, shift_1_1_1_B_D),
-                context=crossattn_emb,
-                crossattn_mask=crossattn_mask,
-                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
-            )
-        else:
-            raise ValueError(f"Unknown block type: {self.block_type}")
-
-        return x
-
-
-class GeneralDITTransformerBlock(nn.Module):
-    """
-    A wrapper module that manages a sequence of DITBuildingBlocks to form a complete transformer layer.
-    Each block in the sequence is specified by a block configuration string.
-
-    Parameters:
-        x_dim (int): Dimension of input features
-        context_dim (int): Dimension of context features for cross-attention blocks
-        num_heads (int): Number of attention heads
-        block_config (str): String specifying block sequence (e.g. "ca-fa-mlp" for cross-attention,
-                          full-attention, then MLP)
-        mlp_ratio (float): MLP hidden dimension multiplier. Default: 4.0
-        x_format (str): Input tensor format. Default: "BTHWD"
-        use_adaln_lora (bool): Whether to use AdaLN-LoRA. Default: False
-        adaln_lora_dim (int): Dimension for AdaLN-LoRA. Default: 256
-
-    The block_config string uses "-" to separate block types:
-        - "ca"/"cross_attn": Cross-attention block
-        - "fa"/"full_attn": Full self-attention block
-        - "mlp"/"ff": MLP/feedforward block
-
-    Example:
-        block_config = "ca-fa-mlp" creates a sequence of:
-        1. Cross-attention block
-        2. Full self-attention block
-        3. MLP block
-    """
-
-    def __init__(
-        self,
-        x_dim: int,
-        context_dim: int,
-        num_heads: int,
-        block_config: str,
-        mlp_ratio: float = 4.0,
-        x_format: str = "BTHWD",
-        use_adaln_lora: bool = False,
-        adaln_lora_dim: int = 256,
-        weight_args={},
-        operations=None
-    ):
-        super().__init__()
-        self.blocks = nn.ModuleList()
-        self.x_format = x_format
-        for block_type in block_config.split("-"):
-            self.blocks.append(
-                DITBuildingBlock(
-                    block_type,
-                    x_dim,
-                    context_dim,
-                    num_heads,
-                    mlp_ratio,
-                    x_format=self.x_format,
-                    use_adaln_lora=use_adaln_lora,
-                    adaln_lora_dim=adaln_lora_dim,
-                    weight_args=weight_args,
-                    operations=operations,
-                )
-            )
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        emb_B_D: torch.Tensor,
-        crossattn_emb: torch.Tensor,
-        crossattn_mask: Optional[torch.Tensor] = None,
-        rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
-        adaln_lora_B_3D: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        for block in self.blocks:
-            x = block(
-                x,
-                emb_B_D,
-                crossattn_emb,
-                crossattn_mask,
-                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
-                adaln_lora_B_3D=adaln_lora_B_3D,
-            )
-        return x
--- a/comfy/ldm/cosmos/cosmos_tokenizer/layers3d.py
+++ b/comfy/ldm/cosmos/cosmos_tokenizer/layers3d.py
--- a/comfy/ldm/cosmos/cosmos_tokenizer/patching.py
+++ b/comfy/ldm/cosmos/cosmos_tokenizer/patching.py
@ -1,377 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""The patcher and unpatcher implementation for 2D and 3D data.
-
-The idea of Haar wavelet is to compute LL, LH, HL, HH component as two 1D convolutions.
-One on the rows and one on the columns.
-For example, in 1D signal, we have [a, b], then the low-freq compoenent is [a + b] / 2 and high-freq is [a - b] / 2.
-We can use a 1D convolution with kernel [1, 1] and stride 2 to represent the L component.
-For H component, we can use a 1D convolution with kernel [1, -1] and stride 2.
-Although in principle, we typically only do additional Haar wavelet over the LL component. But here we do it for all
-   as we need to support downsampling for more than 2x.
-For example, 4x downsampling can be done by 2x Haar and additional 2x Haar, and the shape would be.
-   [3, 256, 256] -> [12, 128, 128] -> [48, 64, 64]
-"""
-
-import torch
-import torch.nn.functional as F
-from einops import rearrange
-
-_WAVELETS = {
-    "haar": torch.tensor([0.7071067811865476, 0.7071067811865476]),
-    "rearrange": torch.tensor([1.0, 1.0]),
-}
-_PERSISTENT = False
-
-
-class Patcher(torch.nn.Module):
-    """A module to convert image tensors into patches using torch operations.
-
-    The main difference from `class Patching` is that this module implements
-    all operations using torch, rather than python or numpy, for efficiency purpose.
-
-    It's bit-wise identical to the Patching module outputs, with the added
-    benefit of being torch.jit scriptable.
-    """
-
-    def __init__(self, patch_size=1, patch_method="haar"):
-        super().__init__()
-        self.patch_size = patch_size
-        self.patch_method = patch_method
-        self.register_buffer(
-            "wavelets", _WAVELETS[patch_method], persistent=_PERSISTENT
-        )
-        self.range = range(int(torch.log2(torch.tensor(self.patch_size)).item()))
-        self.register_buffer(
-            "_arange",
-            torch.arange(_WAVELETS[patch_method].shape[0]),
-            persistent=_PERSISTENT,
-        )
-        for param in self.parameters():
-            param.requires_grad = False
-
-    def forward(self, x):
-        if self.patch_method == "haar":
-            return self._haar(x)
-        elif self.patch_method == "rearrange":
-            return self._arrange(x)
-        else:
-            raise ValueError("Unknown patch method: " + self.patch_method)
-
-    def _dwt(self, x, mode="reflect", rescale=False):
-        dtype = x.dtype
-        h = self.wavelets.to(device=x.device)
-
-        n = h.shape[0]
-        g = x.shape[1]
-        hl = h.flip(0).reshape(1, 1, -1).repeat(g, 1, 1)
-        hh = (h * ((-1) ** self._arange.to(device=x.device))).reshape(1, 1, -1).repeat(g, 1, 1)
-        hh = hh.to(dtype=dtype)
-        hl = hl.to(dtype=dtype)
-
-        x = F.pad(x, pad=(n - 2, n - 1, n - 2, n - 1), mode=mode).to(dtype)
-        xl = F.conv2d(x, hl.unsqueeze(2), groups=g, stride=(1, 2))
-        xh = F.conv2d(x, hh.unsqueeze(2), groups=g, stride=(1, 2))
-        xll = F.conv2d(xl, hl.unsqueeze(3), groups=g, stride=(2, 1))
-        xlh = F.conv2d(xl, hh.unsqueeze(3), groups=g, stride=(2, 1))
-        xhl = F.conv2d(xh, hl.unsqueeze(3), groups=g, stride=(2, 1))
-        xhh = F.conv2d(xh, hh.unsqueeze(3), groups=g, stride=(2, 1))
-
-        out = torch.cat([xll, xlh, xhl, xhh], dim=1)
-        if rescale:
-            out = out / 2
-        return out
-
-    def _haar(self, x):
-        for _ in self.range:
-            x = self._dwt(x, rescale=True)
-        return x
-
-    def _arrange(self, x):
-        x = rearrange(
-            x,
-            "b c (h p1) (w p2) -> b (c p1 p2) h w",
-            p1=self.patch_size,
-            p2=self.patch_size,
-        ).contiguous()
-        return x
-
-
-class Patcher3D(Patcher):
-    """A 3D discrete wavelet transform for video data, expects 5D tensor, i.e. a batch of videos."""
-
-    def __init__(self, patch_size=1, patch_method="haar"):
-        super().__init__(patch_method=patch_method, patch_size=patch_size)
-        self.register_buffer(
-            "patch_size_buffer",
-            patch_size * torch.ones([1], dtype=torch.int32),
-            persistent=_PERSISTENT,
-        )
-
-    def _dwt(self, x, wavelet, mode="reflect", rescale=False):
-        dtype = x.dtype
-        h = self.wavelets.to(device=x.device)
-
-        n = h.shape[0]
-        g = x.shape[1]
-        hl = h.flip(0).reshape(1, 1, -1).repeat(g, 1, 1)
-        hh = (h * ((-1) ** self._arange.to(device=x.device))).reshape(1, 1, -1).repeat(g, 1, 1)
-        hh = hh.to(dtype=dtype)
-        hl = hl.to(dtype=dtype)
-
-        # Handles temporal axis.
-        x = F.pad(
-            x, pad=(max(0, n - 2), n - 1, n - 2, n - 1, n - 2, n - 1), mode=mode
-        ).to(dtype)
-        xl = F.conv3d(x, hl.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1))
-        xh = F.conv3d(x, hh.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1))
-
-        # Handles spatial axes.
-        xll = F.conv3d(xl, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
-        xlh = F.conv3d(xl, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
-        xhl = F.conv3d(xh, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
-        xhh = F.conv3d(xh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
-
-        xlll = F.conv3d(xll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
-        xllh = F.conv3d(xll, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
-        xlhl = F.conv3d(xlh, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
-        xlhh = F.conv3d(xlh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
-        xhll = F.conv3d(xhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
-        xhlh = F.conv3d(xhl, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
-        xhhl = F.conv3d(xhh, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
-        xhhh = F.conv3d(xhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
-
-        out = torch.cat([xlll, xllh, xlhl, xlhh, xhll, xhlh, xhhl, xhhh], dim=1)
-        if rescale:
-            out = out / (2 * torch.sqrt(torch.tensor(2.0)))
-        return out
-
-    def _haar(self, x):
-        xi, xv = torch.split(x, [1, x.shape[2] - 1], dim=2)
-        x = torch.cat([xi.repeat_interleave(self.patch_size, dim=2), xv], dim=2)
-        for _ in self.range:
-            x = self._dwt(x, "haar", rescale=True)
-        return x
-
-    def _arrange(self, x):
-        xi, xv = torch.split(x, [1, x.shape[2] - 1], dim=2)
-        x = torch.cat([xi.repeat_interleave(self.patch_size, dim=2), xv], dim=2)
-        x = rearrange(
-            x,
-            "b c (t p1) (h p2) (w p3) -> b (c p1 p2 p3) t h w",
-            p1=self.patch_size,
-            p2=self.patch_size,
-            p3=self.patch_size,
-        ).contiguous()
-        return x
-
-
-class UnPatcher(torch.nn.Module):
-    """A module to convert patches into image tensorsusing torch operations.
-
-    The main difference from `class Unpatching` is that this module implements
-    all operations using torch, rather than python or numpy, for efficiency purpose.
-
-    It's bit-wise identical to the Unpatching module outputs, with the added
-    benefit of being torch.jit scriptable.
-    """
-
-    def __init__(self, patch_size=1, patch_method="haar"):
-        super().__init__()
-        self.patch_size = patch_size
-        self.patch_method = patch_method
-        self.register_buffer(
-            "wavelets", _WAVELETS[patch_method], persistent=_PERSISTENT
-        )
-        self.range = range(int(torch.log2(torch.tensor(self.patch_size)).item()))
-        self.register_buffer(
-            "_arange",
-            torch.arange(_WAVELETS[patch_method].shape[0]),
-            persistent=_PERSISTENT,
-        )
-        for param in self.parameters():
-            param.requires_grad = False
-
-    def forward(self, x):
-        if self.patch_method == "haar":
-            return self._ihaar(x)
-        elif self.patch_method == "rearrange":
-            return self._iarrange(x)
-        else:
-            raise ValueError("Unknown patch method: " + self.patch_method)
-
-    def _idwt(self, x, wavelet="haar", mode="reflect", rescale=False):
-        dtype = x.dtype
-        h = self.wavelets.to(device=x.device)
-        n = h.shape[0]
-
-        g = x.shape[1] // 4
-        hl = h.flip([0]).reshape(1, 1, -1).repeat([g, 1, 1])
-        hh = (h * ((-1) ** self._arange.to(device=x.device))).reshape(1, 1, -1).repeat(g, 1, 1)
-        hh = hh.to(dtype=dtype)
-        hl = hl.to(dtype=dtype)
-
-        xll, xlh, xhl, xhh = torch.chunk(x.to(dtype), 4, dim=1)
-
-        # Inverse transform.
-        yl = torch.nn.functional.conv_transpose2d(
-            xll, hl.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0)
-        )
-        yl += torch.nn.functional.conv_transpose2d(
-            xlh, hh.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0)
-        )
-        yh = torch.nn.functional.conv_transpose2d(
-            xhl, hl.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0)
-        )
-        yh += torch.nn.functional.conv_transpose2d(
-            xhh, hh.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0)
-        )
-        y = torch.nn.functional.conv_transpose2d(
-            yl, hl.unsqueeze(2), groups=g, stride=(1, 2), padding=(0, n - 2)
-        )
-        y += torch.nn.functional.conv_transpose2d(
-            yh, hh.unsqueeze(2), groups=g, stride=(1, 2), padding=(0, n - 2)
-        )
-
-        if rescale:
-            y = y * 2
-        return y
-
-    def _ihaar(self, x):
-        for _ in self.range:
-            x = self._idwt(x, "haar", rescale=True)
-        return x
-
-    def _iarrange(self, x):
-        x = rearrange(
-            x,
-            "b (c p1 p2) h w -> b c (h p1) (w p2)",
-            p1=self.patch_size,
-            p2=self.patch_size,
-        )
-        return x
-
-
-class UnPatcher3D(UnPatcher):
-    """A 3D inverse discrete wavelet transform for video wavelet decompositions."""
-
-    def __init__(self, patch_size=1, patch_method="haar"):
-        super().__init__(patch_method=patch_method, patch_size=patch_size)
-
-    def _idwt(self, x, wavelet="haar", mode="reflect", rescale=False):
-        dtype = x.dtype
-        h = self.wavelets.to(device=x.device)
-
-        g = x.shape[1] // 8  # split into 8 spatio-temporal filtered tesnors.
-        hl = h.flip([0]).reshape(1, 1, -1).repeat([g, 1, 1])
-        hh = (h * ((-1) ** self._arange.to(device=x.device))).reshape(1, 1, -1).repeat(g, 1, 1)
-        hl = hl.to(dtype=dtype)
-        hh = hh.to(dtype=dtype)
-
-        xlll, xllh, xlhl, xlhh, xhll, xhlh, xhhl, xhhh = torch.chunk(x, 8, dim=1)
-        del x
-
-        # Height height transposed convolutions.
-        xll = F.conv_transpose3d(
-            xlll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
-        )
-        del xlll
-
-        xll += F.conv_transpose3d(
-            xllh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
-        )
-        del xllh
-
-        xlh = F.conv_transpose3d(
-            xlhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
-        )
-        del xlhl
-
-        xlh += F.conv_transpose3d(
-            xlhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
-        )
-        del xlhh
-
-        xhl = F.conv_transpose3d(
-            xhll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
-        )
-        del xhll
-
-        xhl += F.conv_transpose3d(
-            xhlh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
-        )
-        del xhlh
-
-        xhh = F.conv_transpose3d(
-            xhhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
-        )
-        del xhhl
-
-        xhh += F.conv_transpose3d(
-            xhhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2)
-        )
-        del xhhh
-
-        # Handles width transposed convolutions.
-        xl = F.conv_transpose3d(
-            xll, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
-        )
-        del xll
-
-        xl += F.conv_transpose3d(
-            xlh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
-        )
-        del xlh
-
-        xh = F.conv_transpose3d(
-            xhl, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
-        )
-        del xhl
-
-        xh += F.conv_transpose3d(
-            xhh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1)
-        )
-        del xhh
-
-        # Handles time axis transposed convolutions.
-        x = F.conv_transpose3d(
-            xl, hl.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1)
-        )
-        del xl
-
-        x += F.conv_transpose3d(
-            xh, hh.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1)
-        )
-
-        if rescale:
-            x = x * (2 * torch.sqrt(torch.tensor(2.0)))
-        return x
-
-    def _ihaar(self, x):
-        for _ in self.range:
-            x = self._idwt(x, "haar", rescale=True)
-        x = x[:, :, self.patch_size - 1 :, ...]
-        return x
-
-    def _iarrange(self, x):
-        x = rearrange(
-            x,
-            "b (c p1 p2 p3) t h w -> b c (t p1) (h p2) (w p3)",
-            p1=self.patch_size,
-            p2=self.patch_size,
-            p3=self.patch_size,
-        )
-        x = x[:, :, self.patch_size - 1 :, ...]
-        return x
--- a/comfy/ldm/cosmos/cosmos_tokenizer/utils.py
+++ b/comfy/ldm/cosmos/cosmos_tokenizer/utils.py
@ -1,112 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Shared utilities for the networks module."""
-
-from typing import Any
-
-import torch
-from einops import rearrange
-
-
-import comfy.ops
-ops = comfy.ops.disable_weight_init
-
-def time2batch(x: torch.Tensor) -> tuple[torch.Tensor, int]:
-    batch_size = x.shape[0]
-    return rearrange(x, "b c t h w -> (b t) c h w"), batch_size
-
-
-def batch2time(x: torch.Tensor, batch_size: int) -> torch.Tensor:
-    return rearrange(x, "(b t) c h w -> b c t h w", b=batch_size)
-
-
-def space2batch(x: torch.Tensor) -> tuple[torch.Tensor, int]:
-    batch_size, height = x.shape[0], x.shape[-2]
-    return rearrange(x, "b c t h w -> (b h w) c t"), batch_size, height
-
-
-def batch2space(x: torch.Tensor, batch_size: int, height: int) -> torch.Tensor:
-    return rearrange(x, "(b h w) c t -> b c t h w", b=batch_size, h=height)
-
-
-def cast_tuple(t: Any, length: int = 1) -> Any:
-    return t if isinstance(t, tuple) else ((t,) * length)
-
-
-def replication_pad(x):
-    return torch.cat([x[:, :, :1, ...], x], dim=2)
-
-
-def divisible_by(num: int, den: int) -> bool:
-    return (num % den) == 0
-
-
-def is_odd(n: int) -> bool:
-    return not divisible_by(n, 2)
-
-
-def nonlinearity(x):
-    return x * torch.sigmoid(x)
-
-
-def Normalize(in_channels, num_groups=32):
-    return ops.GroupNorm(
-        num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
-    )
-
-
-class CausalNormalize(torch.nn.Module):
-    def __init__(self, in_channels, num_groups=1):
-        super().__init__()
-        self.norm = ops.GroupNorm(
-            num_groups=num_groups,
-            num_channels=in_channels,
-            eps=1e-6,
-            affine=True,
-        )
-        self.num_groups = num_groups
-
-    def forward(self, x):
-        # if num_groups !=1, we apply a spatio-temporal groupnorm for backward compatibility purpose.
-        # All new models should use num_groups=1, otherwise causality is not guaranteed.
-        if self.num_groups == 1:
-            x, batch_size = time2batch(x)
-            return batch2time(self.norm(x), batch_size)
-        return self.norm(x)
-
-
-def exists(v):
-    return v is not None
-
-
-def default(*args):
-    for arg in args:
-        if exists(arg):
-            return arg
-    return None
-
-
-def round_ste(z: torch.Tensor) -> torch.Tensor:
-    """Round with straight through gradients."""
-    zhat = z.round()
-    return z + (zhat - z).detach()
-
-
-def log(t, eps=1e-5):
-    return t.clamp(min=eps).log()
-
-
-def entropy(prob):
-    return (-prob * log(prob)).sum(dim=-1)
--- a/comfy/ldm/cosmos/model.py
+++ b/comfy/ldm/cosmos/model.py
@ -1,514 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-A general implementation of adaln-modulated VIT-like~(DiT) transformer for video processing.
-"""
-
-from typing import Optional, Tuple
-
-import torch
-from einops import rearrange
-from torch import nn
-from torchvision import transforms
-
-from enum import Enum
-import logging
-
-from comfy.ldm.modules.diffusionmodules.mmdit import RMSNorm
-
-from .blocks import (
-    FinalLayer,
-    GeneralDITTransformerBlock,
-    PatchEmbed,
-    TimestepEmbedding,
-    Timesteps,
-)
-
-from .position_embedding import LearnablePosEmbAxis, VideoRopePosition3DEmb
-
-
-class DataType(Enum):
-    IMAGE = "image"
-    VIDEO = "video"
-
-
-class GeneralDIT(nn.Module):
-    """
-    A general implementation of adaln-modulated VIT-like~(DiT) transformer for video processing.
-
-    Args:
-        max_img_h (int): Maximum height of the input images.
-        max_img_w (int): Maximum width of the input images.
-        max_frames (int): Maximum number of frames in the video sequence.
-        in_channels (int): Number of input channels (e.g., RGB channels for color images).
-        out_channels (int): Number of output channels.
-        patch_spatial (tuple): Spatial resolution of patches for input processing.
-        patch_temporal (int): Temporal resolution of patches for input processing.
-        concat_padding_mask (bool): If True, includes a mask channel in the input to handle padding.
-        block_config (str): Configuration of the transformer block. See Notes for supported block types.
-        model_channels (int): Base number of channels used throughout the model.
-        num_blocks (int): Number of transformer blocks.
-        num_heads (int): Number of heads in the multi-head attention layers.
-        mlp_ratio (float): Expansion ratio for MLP blocks.
-        block_x_format (str): Format of input tensor for transformer blocks ('BTHWD' or 'THWBD').
-        crossattn_emb_channels (int): Number of embedding channels for cross-attention.
-        use_cross_attn_mask (bool): Whether to use mask in cross-attention.
-        pos_emb_cls (str): Type of positional embeddings.
-        pos_emb_learnable (bool): Whether positional embeddings are learnable.
-        pos_emb_interpolation (str): Method for interpolating positional embeddings.
-        affline_emb_norm (bool): Whether to normalize affine embeddings.
-        use_adaln_lora (bool): Whether to use AdaLN-LoRA.
-        adaln_lora_dim (int): Dimension for AdaLN-LoRA.
-        rope_h_extrapolation_ratio (float): Height extrapolation ratio for RoPE.
-        rope_w_extrapolation_ratio (float): Width extrapolation ratio for RoPE.
-        rope_t_extrapolation_ratio (float): Temporal extrapolation ratio for RoPE.
-        extra_per_block_abs_pos_emb (bool): Whether to use extra per-block absolute positional embeddings.
-        extra_per_block_abs_pos_emb_type (str): Type of extra per-block positional embeddings.
-        extra_h_extrapolation_ratio (float): Height extrapolation ratio for extra embeddings.
-        extra_w_extrapolation_ratio (float): Width extrapolation ratio for extra embeddings.
-        extra_t_extrapolation_ratio (float): Temporal extrapolation ratio for extra embeddings.
-
-    Notes:
-        Supported block types in block_config:
-        * cross_attn, ca: Cross attention
-        * full_attn: Full attention on all flattened tokens
-        * mlp, ff: Feed forward block
-    """
-
-    def __init__(
-        self,
-        max_img_h: int,
-        max_img_w: int,
-        max_frames: int,
-        in_channels: int,
-        out_channels: int,
-        patch_spatial: tuple,
-        patch_temporal: int,
-        concat_padding_mask: bool = True,
-        # attention settings
-        block_config: str = "FA-CA-MLP",
-        model_channels: int = 768,
-        num_blocks: int = 10,
-        num_heads: int = 16,
-        mlp_ratio: float = 4.0,
-        block_x_format: str = "BTHWD",
-        # cross attention settings
-        crossattn_emb_channels: int = 1024,
-        use_cross_attn_mask: bool = False,
-        # positional embedding settings
-        pos_emb_cls: str = "sincos",
-        pos_emb_learnable: bool = False,
-        pos_emb_interpolation: str = "crop",
-        affline_emb_norm: bool = False,  # whether or not to normalize the affine embedding
-        use_adaln_lora: bool = False,
-        adaln_lora_dim: int = 256,
-        rope_h_extrapolation_ratio: float = 1.0,
-        rope_w_extrapolation_ratio: float = 1.0,
-        rope_t_extrapolation_ratio: float = 1.0,
-        extra_per_block_abs_pos_emb: bool = False,
-        extra_per_block_abs_pos_emb_type: str = "sincos",
-        extra_h_extrapolation_ratio: float = 1.0,
-        extra_w_extrapolation_ratio: float = 1.0,
-        extra_t_extrapolation_ratio: float = 1.0,
-        image_model=None,
-        device=None,
-        dtype=None,
-        operations=None,
-    ) -> None:
-        super().__init__()
-        self.max_img_h = max_img_h
-        self.max_img_w = max_img_w
-        self.max_frames = max_frames
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.patch_spatial = patch_spatial
-        self.patch_temporal = patch_temporal
-        self.num_heads = num_heads
-        self.num_blocks = num_blocks
-        self.model_channels = model_channels
-        self.use_cross_attn_mask = use_cross_attn_mask
-        self.concat_padding_mask = concat_padding_mask
-        # positional embedding settings
-        self.pos_emb_cls = pos_emb_cls
-        self.pos_emb_learnable = pos_emb_learnable
-        self.pos_emb_interpolation = pos_emb_interpolation
-        self.affline_emb_norm = affline_emb_norm
-        self.rope_h_extrapolation_ratio = rope_h_extrapolation_ratio
-        self.rope_w_extrapolation_ratio = rope_w_extrapolation_ratio
-        self.rope_t_extrapolation_ratio = rope_t_extrapolation_ratio
-        self.extra_per_block_abs_pos_emb = extra_per_block_abs_pos_emb
-        self.extra_per_block_abs_pos_emb_type = extra_per_block_abs_pos_emb_type.lower()
-        self.extra_h_extrapolation_ratio = extra_h_extrapolation_ratio
-        self.extra_w_extrapolation_ratio = extra_w_extrapolation_ratio
-        self.extra_t_extrapolation_ratio = extra_t_extrapolation_ratio
-        self.dtype = dtype
-        weight_args = {"device": device, "dtype": dtype}
-
-        in_channels = in_channels + 1 if concat_padding_mask else in_channels
-        self.x_embedder = PatchEmbed(
-            spatial_patch_size=patch_spatial,
-            temporal_patch_size=patch_temporal,
-            in_channels=in_channels,
-            out_channels=model_channels,
-            bias=False,
-            weight_args=weight_args,
-            operations=operations,
-        )
-
-        self.build_pos_embed(device=device, dtype=dtype)
-        self.block_x_format = block_x_format
-        self.use_adaln_lora = use_adaln_lora
-        self.adaln_lora_dim = adaln_lora_dim
-        self.t_embedder = nn.ModuleList(
-            [Timesteps(model_channels),
-             TimestepEmbedding(model_channels, model_channels, use_adaln_lora=use_adaln_lora, weight_args=weight_args, operations=operations),]
-        )
-
-        self.blocks = nn.ModuleDict()
-
-        for idx in range(num_blocks):
-            self.blocks[f"block{idx}"] = GeneralDITTransformerBlock(
-                x_dim=model_channels,
-                context_dim=crossattn_emb_channels,
-                num_heads=num_heads,
-                block_config=block_config,
-                mlp_ratio=mlp_ratio,
-                x_format=self.block_x_format,
-                use_adaln_lora=use_adaln_lora,
-                adaln_lora_dim=adaln_lora_dim,
-                weight_args=weight_args,
-                operations=operations,
-            )
-
-        if self.affline_emb_norm:
-            logging.debug("Building affine embedding normalization layer")
-            self.affline_norm = RMSNorm(model_channels, elementwise_affine=True, eps=1e-6)
-        else:
-            self.affline_norm = nn.Identity()
-
-        self.final_layer = FinalLayer(
-            hidden_size=self.model_channels,
-            spatial_patch_size=self.patch_spatial,
-            temporal_patch_size=self.patch_temporal,
-            out_channels=self.out_channels,
-            use_adaln_lora=self.use_adaln_lora,
-            adaln_lora_dim=self.adaln_lora_dim,
-            weight_args=weight_args,
-            operations=operations,
-        )
-
-    def build_pos_embed(self, device=None, dtype=None):
-        if self.pos_emb_cls == "rope3d":
-            cls_type = VideoRopePosition3DEmb
-        else:
-            raise ValueError(f"Unknown pos_emb_cls {self.pos_emb_cls}")
-
-        logging.debug(f"Building positional embedding with {self.pos_emb_cls} class, impl {cls_type}")
-        kwargs = dict(
-            model_channels=self.model_channels,
-            len_h=self.max_img_h // self.patch_spatial,
-            len_w=self.max_img_w // self.patch_spatial,
-            len_t=self.max_frames // self.patch_temporal,
-            is_learnable=self.pos_emb_learnable,
-            interpolation=self.pos_emb_interpolation,
-            head_dim=self.model_channels // self.num_heads,
-            h_extrapolation_ratio=self.rope_h_extrapolation_ratio,
-            w_extrapolation_ratio=self.rope_w_extrapolation_ratio,
-            t_extrapolation_ratio=self.rope_t_extrapolation_ratio,
-            device=device,
-        )
-        self.pos_embedder = cls_type(
-            **kwargs,
-        )
-
-        if self.extra_per_block_abs_pos_emb:
-            assert self.extra_per_block_abs_pos_emb_type in [
-                "learnable",
-            ], f"Unknown extra_per_block_abs_pos_emb_type {self.extra_per_block_abs_pos_emb_type}"
-            kwargs["h_extrapolation_ratio"] = self.extra_h_extrapolation_ratio
-            kwargs["w_extrapolation_ratio"] = self.extra_w_extrapolation_ratio
-            kwargs["t_extrapolation_ratio"] = self.extra_t_extrapolation_ratio
-            kwargs["device"] = device
-            kwargs["dtype"] = dtype
-            self.extra_pos_embedder = LearnablePosEmbAxis(
-                **kwargs,
-            )
-
-    def prepare_embedded_sequence(
-        self,
-        x_B_C_T_H_W: torch.Tensor,
-        fps: Optional[torch.Tensor] = None,
-        padding_mask: Optional[torch.Tensor] = None,
-        latent_condition: Optional[torch.Tensor] = None,
-        latent_condition_sigma: Optional[torch.Tensor] = None,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
-        """
-        Prepares an embedded sequence tensor by applying positional embeddings and handling padding masks.
-
-        Args:
-            x_B_C_T_H_W (torch.Tensor): video
-            fps (Optional[torch.Tensor]): Frames per second tensor to be used for positional embedding when required.
-                                    If None, a default value (`self.base_fps`) will be used.
-            padding_mask (Optional[torch.Tensor]): current it is not used
-
-        Returns:
-            Tuple[torch.Tensor, Optional[torch.Tensor]]:
-                - A tensor of shape (B, T, H, W, D) with the embedded sequence.
-                - An optional positional embedding tensor, returned only if the positional embedding class
-                (`self.pos_emb_cls`) includes 'rope'. Otherwise, None.
-
-        Notes:
-            - If `self.concat_padding_mask` is True, a padding mask channel is concatenated to the input tensor.
-            - The method of applying positional embeddings depends on the value of `self.pos_emb_cls`.
-            - If 'rope' is in `self.pos_emb_cls` (case insensitive), the positional embeddings are generated using
-                the `self.pos_embedder` with the shape [T, H, W].
-            - If "fps_aware" is in `self.pos_emb_cls`, the positional embeddings are generated using the
-            `self.pos_embedder` with the fps tensor.
-            - Otherwise, the positional embeddings are generated without considering fps.
-        """
-        if self.concat_padding_mask:
-            if padding_mask is not None:
-                padding_mask = transforms.functional.resize(
-                    padding_mask, list(x_B_C_T_H_W.shape[-2:]), interpolation=transforms.InterpolationMode.NEAREST
-                )
-            else:
-                padding_mask = torch.zeros((x_B_C_T_H_W.shape[0], 1, x_B_C_T_H_W.shape[-2], x_B_C_T_H_W.shape[-1]), dtype=x_B_C_T_H_W.dtype, device=x_B_C_T_H_W.device)
-
-            x_B_C_T_H_W = torch.cat(
-                [x_B_C_T_H_W, padding_mask.unsqueeze(1).repeat(1, 1, x_B_C_T_H_W.shape[2], 1, 1)], dim=1
-            )
-        x_B_T_H_W_D = self.x_embedder(x_B_C_T_H_W)
-
-        if self.extra_per_block_abs_pos_emb:
-            extra_pos_emb = self.extra_pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device, dtype=x_B_C_T_H_W.dtype)
-        else:
-            extra_pos_emb = None
-
-        if "rope" in self.pos_emb_cls.lower():
-            return x_B_T_H_W_D, self.pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device), extra_pos_emb
-
-        if "fps_aware" in self.pos_emb_cls:
-            x_B_T_H_W_D = x_B_T_H_W_D + self.pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device)  # [B, T, H, W, D]
-        else:
-            x_B_T_H_W_D = x_B_T_H_W_D + self.pos_embedder(x_B_T_H_W_D, device=x_B_C_T_H_W.device)  # [B, T, H, W, D]
-
-        return x_B_T_H_W_D, None, extra_pos_emb
-
-    def decoder_head(
-        self,
-        x_B_T_H_W_D: torch.Tensor,
-        emb_B_D: torch.Tensor,
-        crossattn_emb: torch.Tensor,
-        origin_shape: Tuple[int, int, int, int, int],  # [B, C, T, H, W]
-        crossattn_mask: Optional[torch.Tensor] = None,
-        adaln_lora_B_3D: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        del crossattn_emb, crossattn_mask
-        B, C, T_before_patchify, H_before_patchify, W_before_patchify = origin_shape
-        x_BT_HW_D = rearrange(x_B_T_H_W_D, "B T H W D -> (B T) (H W) D")
-        x_BT_HW_D = self.final_layer(x_BT_HW_D, emb_B_D, adaln_lora_B_3D=adaln_lora_B_3D)
-        # This is to ensure x_BT_HW_D has the correct shape because
-        # when we merge T, H, W into one dimension, x_BT_HW_D has shape (B * T * H * W, 1*1, D).
-        x_BT_HW_D = x_BT_HW_D.view(
-            B * T_before_patchify // self.patch_temporal,
-            H_before_patchify // self.patch_spatial * W_before_patchify // self.patch_spatial,
-            -1,
-        )
-        x_B_D_T_H_W = rearrange(
-            x_BT_HW_D,
-            "(B T) (H W) (p1 p2 t C) -> B C (T t) (H p1) (W p2)",
-            p1=self.patch_spatial,
-            p2=self.patch_spatial,
-            H=H_before_patchify // self.patch_spatial,
-            W=W_before_patchify // self.patch_spatial,
-            t=self.patch_temporal,
-            B=B,
-        )
-        return x_B_D_T_H_W
-
-    def forward_before_blocks(
-        self,
-        x: torch.Tensor,
-        timesteps: torch.Tensor,
-        crossattn_emb: torch.Tensor,
-        crossattn_mask: Optional[torch.Tensor] = None,
-        fps: Optional[torch.Tensor] = None,
-        image_size: Optional[torch.Tensor] = None,
-        padding_mask: Optional[torch.Tensor] = None,
-        scalar_feature: Optional[torch.Tensor] = None,
-        data_type: Optional[DataType] = DataType.VIDEO,
-        latent_condition: Optional[torch.Tensor] = None,
-        latent_condition_sigma: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> torch.Tensor:
-        """
-        Args:
-            x: (B, C, T, H, W) tensor of spatial-temp inputs
-            timesteps: (B, ) tensor of timesteps
-            crossattn_emb: (B, N, D) tensor of cross-attention embeddings
-            crossattn_mask: (B, N) tensor of cross-attention masks
-        """
-        del kwargs
-        assert isinstance(
-            data_type, DataType
-        ), f"Expected DataType, got {type(data_type)}. We need discuss this flag later."
-        original_shape = x.shape
-        x_B_T_H_W_D, rope_emb_L_1_1_D, extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D = self.prepare_embedded_sequence(
-            x,
-            fps=fps,
-            padding_mask=padding_mask,
-            latent_condition=latent_condition,
-            latent_condition_sigma=latent_condition_sigma,
-        )
-        # logging affline scale information
-        affline_scale_log_info = {}
-
-        timesteps_B_D, adaln_lora_B_3D = self.t_embedder[1](self.t_embedder[0](timesteps.flatten()).to(x.dtype))
-        affline_emb_B_D = timesteps_B_D
-        affline_scale_log_info["timesteps_B_D"] = timesteps_B_D.detach()
-
-        if scalar_feature is not None:
-            raise NotImplementedError("Scalar feature is not implemented yet.")
-
-        affline_scale_log_info["affline_emb_B_D"] = affline_emb_B_D.detach()
-        affline_emb_B_D = self.affline_norm(affline_emb_B_D)
-
-        if self.use_cross_attn_mask:
-            if crossattn_mask is not None and not torch.is_floating_point(crossattn_mask):
-                crossattn_mask = (crossattn_mask - 1).to(x.dtype) * torch.finfo(x.dtype).max
-            crossattn_mask = crossattn_mask[:, None, None, :]  # .to(dtype=torch.bool)  # [B, 1, 1, length]
-        else:
-            crossattn_mask = None
-
-        if self.blocks["block0"].x_format == "THWBD":
-            x = rearrange(x_B_T_H_W_D, "B T H W D -> T H W B D")
-            if extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D is not None:
-                extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D = rearrange(
-                    extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D, "B T H W D -> T H W B D"
-                )
-            crossattn_emb = rearrange(crossattn_emb, "B M D -> M B D")
-
-            if crossattn_mask:
-                crossattn_mask = rearrange(crossattn_mask, "B M -> M B")
-
-        elif self.blocks["block0"].x_format == "BTHWD":
-            x = x_B_T_H_W_D
-        else:
-            raise ValueError(f"Unknown x_format {self.blocks[0].x_format}")
-        output = {
-            "x": x,
-            "affline_emb_B_D": affline_emb_B_D,
-            "crossattn_emb": crossattn_emb,
-            "crossattn_mask": crossattn_mask,
-            "rope_emb_L_1_1_D": rope_emb_L_1_1_D,
-            "adaln_lora_B_3D": adaln_lora_B_3D,
-            "original_shape": original_shape,
-            "extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D": extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D,
-        }
-        return output
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        timesteps: torch.Tensor,
-        context: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        # crossattn_emb: torch.Tensor,
-        # crossattn_mask: Optional[torch.Tensor] = None,
-        fps: Optional[torch.Tensor] = None,
-        image_size: Optional[torch.Tensor] = None,
-        padding_mask: Optional[torch.Tensor] = None,
-        scalar_feature: Optional[torch.Tensor] = None,
-        data_type: Optional[DataType] = DataType.VIDEO,
-        latent_condition: Optional[torch.Tensor] = None,
-        latent_condition_sigma: Optional[torch.Tensor] = None,
-        condition_video_augment_sigma: Optional[torch.Tensor] = None,
-        **kwargs,
-    ):
-        """
-        Args:
-            x: (B, C, T, H, W) tensor of spatial-temp inputs
-            timesteps: (B, ) tensor of timesteps
-            crossattn_emb: (B, N, D) tensor of cross-attention embeddings
-            crossattn_mask: (B, N) tensor of cross-attention masks
-            condition_video_augment_sigma: (B,) used in lvg(long video generation), we add noise with this sigma to
-                augment condition input, the lvg model will condition on the condition_video_augment_sigma value;
-                we need forward_before_blocks pass to the forward_before_blocks function.
-        """
-
-        crossattn_emb = context
-        crossattn_mask = attention_mask
-
-        inputs = self.forward_before_blocks(
-            x=x,
-            timesteps=timesteps,
-            crossattn_emb=crossattn_emb,
-            crossattn_mask=crossattn_mask,
-            fps=fps,
-            image_size=image_size,
-            padding_mask=padding_mask,
-            scalar_feature=scalar_feature,
-            data_type=data_type,
-            latent_condition=latent_condition,
-            latent_condition_sigma=latent_condition_sigma,
-            condition_video_augment_sigma=condition_video_augment_sigma,
-            **kwargs,
-        )
-        x, affline_emb_B_D, crossattn_emb, crossattn_mask, rope_emb_L_1_1_D, adaln_lora_B_3D, original_shape = (
-            inputs["x"],
-            inputs["affline_emb_B_D"],
-            inputs["crossattn_emb"],
-            inputs["crossattn_mask"],
-            inputs["rope_emb_L_1_1_D"],
-            inputs["adaln_lora_B_3D"],
-            inputs["original_shape"],
-        )
-        extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D = inputs["extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D"].to(x.dtype)
-        del inputs
-
-        if extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D is not None:
-            assert (
-                x.shape == extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape
-            ), f"{x.shape} != {extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape} {original_shape}"
-
-        for _, block in self.blocks.items():
-            assert (
-                self.blocks["block0"].x_format == block.x_format
-            ), f"First block has x_format {self.blocks[0].x_format}, got {block.x_format}"
-
-            if extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D is not None:
-                x += extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D
-            x = block(
-                x,
-                affline_emb_B_D,
-                crossattn_emb,
-                crossattn_mask,
-                rope_emb_L_1_1_D=rope_emb_L_1_1_D,
-                adaln_lora_B_3D=adaln_lora_B_3D,
-            )
-
-        x_B_T_H_W_D = rearrange(x, "T H W B D -> B T H W D")
-
-        x_B_D_T_H_W = self.decoder_head(
-            x_B_T_H_W_D=x_B_T_H_W_D,
-            emb_B_D=affline_emb_B_D,
-            crossattn_emb=None,
-            origin_shape=original_shape,
-            crossattn_mask=None,
-            adaln_lora_B_3D=adaln_lora_B_3D,
-        )
-
-        return x_B_D_T_H_W
--- a/comfy/ldm/cosmos/position_embedding.py
+++ b/comfy/ldm/cosmos/position_embedding.py
@ -1,208 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import List, Optional
-
-import torch
-from einops import rearrange, repeat
-from torch import nn
-import math
-
-
-def normalize(x: torch.Tensor, dim: Optional[List[int]] = None, eps: float = 0) -> torch.Tensor:
-    """
-    Normalizes the input tensor along specified dimensions such that the average square norm of elements is adjusted.
-
-    Args:
-        x (torch.Tensor): The input tensor to normalize.
-        dim (list, optional): The dimensions over which to normalize. If None, normalizes over all dimensions except the first.
-        eps (float, optional): A small constant to ensure numerical stability during division.
-
-    Returns:
-        torch.Tensor: The normalized tensor.
-    """
-    if dim is None:
-        dim = list(range(1, x.ndim))
-    norm = torch.linalg.vector_norm(x, dim=dim, keepdim=True, dtype=torch.float32)
-    norm = torch.add(eps, norm, alpha=math.sqrt(norm.numel() / x.numel()))
-    return x / norm.to(x.dtype)
-
-
-class VideoPositionEmb(nn.Module):
-    def forward(self, x_B_T_H_W_C: torch.Tensor, fps=Optional[torch.Tensor], device=None, dtype=None) -> torch.Tensor:
-        """
-        It delegates the embedding generation to generate_embeddings function.
-        """
-        B_T_H_W_C = x_B_T_H_W_C.shape
-        embeddings = self.generate_embeddings(B_T_H_W_C, fps=fps, device=device, dtype=dtype)
-
-        return embeddings
-
-    def generate_embeddings(self, B_T_H_W_C: torch.Size, fps=Optional[torch.Tensor], device=None):
-        raise NotImplementedError
-
-
-class VideoRopePosition3DEmb(VideoPositionEmb):
-    def __init__(
-        self,
-        *,  # enforce keyword arguments
-        head_dim: int,
-        len_h: int,
-        len_w: int,
-        len_t: int,
-        base_fps: int = 24,
-        h_extrapolation_ratio: float = 1.0,
-        w_extrapolation_ratio: float = 1.0,
-        t_extrapolation_ratio: float = 1.0,
-        device=None,
-        **kwargs,  # used for compatibility with other positional embeddings; unused in this class
-    ):
-        del kwargs
-        super().__init__()
-        self.register_buffer("seq", torch.arange(max(len_h, len_w, len_t), dtype=torch.float, device=device))
-        self.base_fps = base_fps
-        self.max_h = len_h
-        self.max_w = len_w
-
-        dim = head_dim
-        dim_h = dim // 6 * 2
-        dim_w = dim_h
-        dim_t = dim - 2 * dim_h
-        assert dim == dim_h + dim_w + dim_t, f"bad dim: {dim} != {dim_h} + {dim_w} + {dim_t}"
-        self.register_buffer(
-            "dim_spatial_range",
-            torch.arange(0, dim_h, 2, device=device)[: (dim_h // 2)].float() / dim_h,
-            persistent=False,
-        )
-        self.register_buffer(
-            "dim_temporal_range",
-            torch.arange(0, dim_t, 2, device=device)[: (dim_t // 2)].float() / dim_t,
-            persistent=False,
-        )
-
-        self.h_ntk_factor = h_extrapolation_ratio ** (dim_h / (dim_h - 2))
-        self.w_ntk_factor = w_extrapolation_ratio ** (dim_w / (dim_w - 2))
-        self.t_ntk_factor = t_extrapolation_ratio ** (dim_t / (dim_t - 2))
-
-    def generate_embeddings(
-        self,
-        B_T_H_W_C: torch.Size,
-        fps: Optional[torch.Tensor] = None,
-        h_ntk_factor: Optional[float] = None,
-        w_ntk_factor: Optional[float] = None,
-        t_ntk_factor: Optional[float] = None,
-        device=None,
-        dtype=None,
-    ):
-        """
-        Generate embeddings for the given input size.
-
-        Args:
-            B_T_H_W_C (torch.Size): Input tensor size (Batch, Time, Height, Width, Channels).
-            fps (Optional[torch.Tensor], optional): Frames per second. Defaults to None.
-            h_ntk_factor (Optional[float], optional): Height NTK factor. If None, uses self.h_ntk_factor.
-            w_ntk_factor (Optional[float], optional): Width NTK factor. If None, uses self.w_ntk_factor.
-            t_ntk_factor (Optional[float], optional): Time NTK factor. If None, uses self.t_ntk_factor.
-
-        Returns:
-            Not specified in the original code snippet.
-        """
-        h_ntk_factor = h_ntk_factor if h_ntk_factor is not None else self.h_ntk_factor
-        w_ntk_factor = w_ntk_factor if w_ntk_factor is not None else self.w_ntk_factor
-        t_ntk_factor = t_ntk_factor if t_ntk_factor is not None else self.t_ntk_factor
-
-        h_theta = 10000.0 * h_ntk_factor
-        w_theta = 10000.0 * w_ntk_factor
-        t_theta = 10000.0 * t_ntk_factor
-
-        h_spatial_freqs = 1.0 / (h_theta**self.dim_spatial_range.to(device=device))
-        w_spatial_freqs = 1.0 / (w_theta**self.dim_spatial_range.to(device=device))
-        temporal_freqs = 1.0 / (t_theta**self.dim_temporal_range.to(device=device))
-
-        B, T, H, W, _ = B_T_H_W_C
-        uniform_fps = (fps is None) or isinstance(fps, (int, float)) or (fps.min() == fps.max())
-        assert (
-            uniform_fps or B == 1 or T == 1
-        ), "For video batch, batch size should be 1 for non-uniform fps. For image batch, T should be 1"
-        assert (
-            H <= self.max_h and W <= self.max_w
-        ), f"Input dimensions (H={H}, W={W}) exceed the maximum dimensions (max_h={self.max_h}, max_w={self.max_w})"
-        half_emb_h = torch.outer(self.seq[:H].to(device=device), h_spatial_freqs)
-        half_emb_w = torch.outer(self.seq[:W].to(device=device), w_spatial_freqs)
-
-        # apply sequence scaling in temporal dimension
-        if fps is None:  # image case
-            half_emb_t = torch.outer(self.seq[:T].to(device=device), temporal_freqs)
-        else:
-            half_emb_t = torch.outer(self.seq[:T].to(device=device) / fps * self.base_fps, temporal_freqs)
-
-        half_emb_h = torch.stack([torch.cos(half_emb_h), -torch.sin(half_emb_h), torch.sin(half_emb_h), torch.cos(half_emb_h)], dim=-1)
-        half_emb_w = torch.stack([torch.cos(half_emb_w), -torch.sin(half_emb_w), torch.sin(half_emb_w), torch.cos(half_emb_w)], dim=-1)
-        half_emb_t = torch.stack([torch.cos(half_emb_t), -torch.sin(half_emb_t), torch.sin(half_emb_t), torch.cos(half_emb_t)], dim=-1)
-
-        em_T_H_W_D = torch.cat(
-            [
-                repeat(half_emb_t, "t d x -> t h w d x", h=H, w=W),
-                repeat(half_emb_h, "h d x -> t h w d x", t=T, w=W),
-                repeat(half_emb_w, "w d x -> t h w d x", t=T, h=H),
-            ]
-            , dim=-2,
-        )
-
-        return rearrange(em_T_H_W_D, "t h w d (i j) -> (t h w) d i j", i=2, j=2).float()
-
-
-class LearnablePosEmbAxis(VideoPositionEmb):
-    def __init__(
-        self,
-        *,  # enforce keyword arguments
-        interpolation: str,
-        model_channels: int,
-        len_h: int,
-        len_w: int,
-        len_t: int,
-        device=None,
-        dtype=None,
-        **kwargs,
-    ):
-        """
-        Args:
-            interpolation (str): we curretly only support "crop", ideally when we need extrapolation capacity, we should adjust frequency or other more advanced methods. they are not implemented yet.
-        """
-        del kwargs  # unused
-        super().__init__()
-        self.interpolation = interpolation
-        assert self.interpolation in ["crop"], f"Unknown interpolation method {self.interpolation}"
-
-        self.pos_emb_h = nn.Parameter(torch.empty(len_h, model_channels, device=device, dtype=dtype))
-        self.pos_emb_w = nn.Parameter(torch.empty(len_w, model_channels, device=device, dtype=dtype))
-        self.pos_emb_t = nn.Parameter(torch.empty(len_t, model_channels, device=device, dtype=dtype))
-
-    def generate_embeddings(self, B_T_H_W_C: torch.Size, fps=Optional[torch.Tensor], device=None, dtype=None) -> torch.Tensor:
-        B, T, H, W, _ = B_T_H_W_C
-        if self.interpolation == "crop":
-            emb_h_H = self.pos_emb_h[:H].to(device=device, dtype=dtype)
-            emb_w_W = self.pos_emb_w[:W].to(device=device, dtype=dtype)
-            emb_t_T = self.pos_emb_t[:T].to(device=device, dtype=dtype)
-            emb = (
-                repeat(emb_t_T, "t d-> b t h w d", b=B, h=H, w=W)
-                + repeat(emb_h_H, "h d-> b t h w d", b=B, t=T, w=W)
-                + repeat(emb_w_W, "w d-> b t h w d", b=B, t=T, h=H)
-            )
-            assert list(emb.shape)[:4] == [B, T, H, W], f"bad shape: {list(emb.shape)[:4]} != {B, T, H, W}"
-        else:
-            raise ValueError(f"Unknown interpolation method {self.interpolation}")
-
-        return normalize(emb, dim=-1, eps=1e-6)
--- a/comfy/ldm/cosmos/vae.py
+++ b/comfy/ldm/cosmos/vae.py
@ -1,131 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""The causal continuous video tokenizer with VAE or AE formulation for 3D data.."""
-
-import logging
-import torch
-from torch import nn
-from enum import Enum
-import math
-
-from .cosmos_tokenizer.layers3d import (
-    EncoderFactorized,
-    DecoderFactorized,
-    CausalConv3d,
-)
-
-
-class IdentityDistribution(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, parameters):
-        return parameters, (torch.tensor([0.0]), torch.tensor([0.0]))
-
-
-class GaussianDistribution(torch.nn.Module):
-    def __init__(self, min_logvar: float = -30.0, max_logvar: float = 20.0):
-        super().__init__()
-        self.min_logvar = min_logvar
-        self.max_logvar = max_logvar
-
-    def sample(self, mean, logvar):
-        std = torch.exp(0.5 * logvar)
-        return mean + std * torch.randn_like(mean)
-
-    def forward(self, parameters):
-        mean, logvar = torch.chunk(parameters, 2, dim=1)
-        logvar = torch.clamp(logvar, self.min_logvar, self.max_logvar)
-        return self.sample(mean, logvar), (mean, logvar)
-
-
-class ContinuousFormulation(Enum):
-    VAE = GaussianDistribution
-    AE = IdentityDistribution
-
-
-class CausalContinuousVideoTokenizer(nn.Module):
-    def __init__(
-        self, z_channels: int, z_factor: int, latent_channels: int, **kwargs
-    ) -> None:
-        super().__init__()
-        self.name = kwargs.get("name", "CausalContinuousVideoTokenizer")
-        self.latent_channels = latent_channels
-        self.sigma_data = 0.5
-
-        # encoder_name = kwargs.get("encoder", Encoder3DType.BASE.name)
-        self.encoder = EncoderFactorized(
-            z_channels=z_factor * z_channels, **kwargs
-        )
-        if kwargs.get("temporal_compression", 4) == 4:
-            kwargs["channels_mult"] = [2, 4]
-        # decoder_name = kwargs.get("decoder", Decoder3DType.BASE.name)
-        self.decoder = DecoderFactorized(
-            z_channels=z_channels, **kwargs
-        )
-
-        self.quant_conv = CausalConv3d(
-            z_factor * z_channels,
-            z_factor * latent_channels,
-            kernel_size=1,
-            padding=0,
-        )
-        self.post_quant_conv = CausalConv3d(
-            latent_channels, z_channels, kernel_size=1, padding=0
-        )
-
-        # formulation_name = kwargs.get("formulation", ContinuousFormulation.AE.name)
-        self.distribution = IdentityDistribution()  # ContinuousFormulation[formulation_name].value()
-
-        num_parameters = sum(param.numel() for param in self.parameters())
-        logging.debug(f"model={self.name}, num_parameters={num_parameters:,}")
-        logging.debug(
-            f"z_channels={z_channels}, latent_channels={self.latent_channels}."
-        )
-
-        latent_temporal_chunk = 16
-        self.latent_mean = nn.Parameter(torch.zeros([self.latent_channels * latent_temporal_chunk], dtype=torch.float32))
-        self.latent_std = nn.Parameter(torch.ones([self.latent_channels * latent_temporal_chunk], dtype=torch.float32))
-
-
-    def encode(self, x):
-        h = self.encoder(x)
-        moments = self.quant_conv(h)
-        z, posteriors = self.distribution(moments)
-        latent_ch = z.shape[1]
-        latent_t = z.shape[2]
-        in_dtype = z.dtype
-        mean = self.latent_mean.view(latent_ch, -1)
-        std = self.latent_std.view(latent_ch, -1)
-
-        mean = mean.repeat(1, math.ceil(latent_t / mean.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
-        std = std.repeat(1, math.ceil(latent_t / std.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
-        return ((z - mean) / std) * self.sigma_data
-
-    def decode(self, z):
-        in_dtype = z.dtype
-        latent_ch = z.shape[1]
-        latent_t = z.shape[2]
-        mean = self.latent_mean.view(latent_ch, -1)
-        std = self.latent_std.view(latent_ch, -1)
-
-        mean = mean.repeat(1, math.ceil(latent_t / mean.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
-        std = std.repeat(1, math.ceil(latent_t / std.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device)
-
-        z = z / self.sigma_data
-        z = z * std + mean
-        z = self.post_quant_conv(z)
-        return self.decoder(z)
-
--- a/comfy/ldm/flux/controlnet.py
+++ b/comfy/ldm/flux/controlnet.py
@ -1,203 +0,0 @@
-#Original code can be found on: https://github.com/XLabs-AI/x-flux/blob/main/src/flux/controlnet.py
-#modified to support different types of flux controlnets
-
-import torch
-import math
-from torch import Tensor, nn
-from einops import rearrange, repeat
-
-from .layers import (timestep_embedding)
-
-from .model import Flux
-import comfy.ldm.common_dit
-
-class MistolineCondDownsamplBlock(nn.Module):
-    def __init__(self, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.encoder = nn.Sequential(
-            operations.Conv2d(3, 16, 3, padding=1, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 1, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 1, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device)
-        )
-
-    def forward(self, x):
-        return self.encoder(x)
-
-class MistolineControlnetBlock(nn.Module):
-    def __init__(self, hidden_size, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.linear = operations.Linear(hidden_size, hidden_size, dtype=dtype, device=device)
-        self.act = nn.SiLU()
-
-    def forward(self, x):
-        return self.act(self.linear(x))
-
-
-class ControlNetFlux(Flux):
-    def __init__(self, latent_input=False, num_union_modes=0, mistoline=False, control_latent_channels=None, image_model=None, dtype=None, device=None, operations=None, **kwargs):
-        super().__init__(final_layer=False, dtype=dtype, device=device, operations=operations, **kwargs)
-
-        self.main_model_double = 19
-        self.main_model_single = 38
-
-        self.mistoline = mistoline
-        # add ControlNet blocks
-        if self.mistoline:
-            control_block = lambda : MistolineControlnetBlock(self.hidden_size, dtype=dtype, device=device, operations=operations)
-        else:
-            control_block = lambda : operations.Linear(self.hidden_size, self.hidden_size, dtype=dtype, device=device)
-
-        self.controlnet_blocks = nn.ModuleList([])
-        for _ in range(self.params.depth):
-            self.controlnet_blocks.append(control_block())
-
-        self.controlnet_single_blocks = nn.ModuleList([])
-        for _ in range(self.params.depth_single_blocks):
-            self.controlnet_single_blocks.append(control_block())
-
-        self.num_union_modes = num_union_modes
-        self.controlnet_mode_embedder = None
-        if self.num_union_modes > 0:
-            self.controlnet_mode_embedder = operations.Embedding(self.num_union_modes, self.hidden_size, dtype=dtype, device=device)
-
-        self.gradient_checkpointing = False
-        self.latent_input = latent_input
-        if control_latent_channels is None:
-            control_latent_channels = self.in_channels
-        else:
-            control_latent_channels *= 2 * 2 #patch size
-
-        self.pos_embed_input = operations.Linear(control_latent_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
-        if not self.latent_input:
-            if self.mistoline:
-                self.input_cond_block = MistolineCondDownsamplBlock(dtype=dtype, device=device, operations=operations)
-            else:
-                self.input_hint_block = nn.Sequential(
-                    operations.Conv2d(3, 16, 3, padding=1, dtype=dtype, device=device),
-                    nn.SiLU(),
-                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
-                    nn.SiLU(),
-                    operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
-                    nn.SiLU(),
-                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
-                    nn.SiLU(),
-                    operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
-                    nn.SiLU(),
-                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device),
-                    nn.SiLU(),
-                    operations.Conv2d(16, 16, 3, padding=1, stride=2, dtype=dtype, device=device),
-                    nn.SiLU(),
-                    operations.Conv2d(16, 16, 3, padding=1, dtype=dtype, device=device)
-                )
-
-    def forward_orig(
-        self,
-        img: Tensor,
-        img_ids: Tensor,
-        controlnet_cond: Tensor,
-        txt: Tensor,
-        txt_ids: Tensor,
-        timesteps: Tensor,
-        y: Tensor,
-        guidance: Tensor = None,
-        control_type: Tensor = None,
-    ) -> Tensor:
-        if img.ndim != 3 or txt.ndim != 3:
-            raise ValueError("Input img and txt tensors must have 3 dimensions.")
-
-        # running on sequences img
-        img = self.img_in(img)
-
-        controlnet_cond = self.pos_embed_input(controlnet_cond)
-        img = img + controlnet_cond
-        vec = self.time_in(timestep_embedding(timesteps, 256))
-        if self.params.guidance_embed:
-            vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
-        vec = vec + self.vector_in(y)
-        txt = self.txt_in(txt)
-
-        if self.controlnet_mode_embedder is not None and len(control_type) > 0:
-            control_cond = self.controlnet_mode_embedder(torch.tensor(control_type, device=img.device), out_dtype=img.dtype).unsqueeze(0).repeat((txt.shape[0], 1, 1))
-            txt = torch.cat([control_cond, txt], dim=1)
-            txt_ids = torch.cat([txt_ids[:,:1], txt_ids], dim=1)
-
-        ids = torch.cat((txt_ids, img_ids), dim=1)
-        pe = self.pe_embedder(ids)
-
-        controlnet_double = ()
-
-        for i in range(len(self.double_blocks)):
-            img, txt = self.double_blocks[i](img=img, txt=txt, vec=vec, pe=pe)
-            controlnet_double = controlnet_double + (self.controlnet_blocks[i](img),)
-
-        img = torch.cat((txt, img), 1)
-
-        controlnet_single = ()
-
-        for i in range(len(self.single_blocks)):
-            img = self.single_blocks[i](img, vec=vec, pe=pe)
-            controlnet_single = controlnet_single + (self.controlnet_single_blocks[i](img[:, txt.shape[1] :, ...]),)
-
-        repeat = math.ceil(self.main_model_double / len(controlnet_double))
-        if self.latent_input:
-            out_input = ()
-            for x in controlnet_double:
-                    out_input += (x,) * repeat
-        else:
-            out_input = (controlnet_double * repeat)
-
-        out = {"input": out_input[:self.main_model_double]}
-        if len(controlnet_single) > 0:
-            repeat = math.ceil(self.main_model_single / len(controlnet_single))
-            out_output = ()
-            if self.latent_input:
-                for x in controlnet_single:
-                        out_output += (x,) * repeat
-            else:
-                out_output = (controlnet_single * repeat)
-            out["output"] = out_output[:self.main_model_single]
-        return out
-
-    def forward(self, x, timesteps, context, y, guidance=None, hint=None, **kwargs):
-        patch_size = 2
-        if self.latent_input:
-            hint = comfy.ldm.common_dit.pad_to_patch_size(hint, (patch_size, patch_size))
-        elif self.mistoline:
-            hint = hint * 2.0 - 1.0
-            hint = self.input_cond_block(hint)
-        else:
-            hint = hint * 2.0 - 1.0
-            hint = self.input_hint_block(hint)
-
-        hint = rearrange(hint, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
-
-        bs, c, h, w = x.shape
-        x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
-
-        img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
-
-        h_len = ((h + (patch_size // 2)) // patch_size)
-        w_len = ((w + (patch_size // 2)) // patch_size)
-        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
-        img_ids[..., 1] = img_ids[..., 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype)[:, None]
-        img_ids[..., 2] = img_ids[..., 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype)[None, :]
-        img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
-
-        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
-        return self.forward_orig(img, img_ids, hint, context, txt_ids, timesteps, y, guidance, control_type=kwargs.get("control_type", []))
--- a/comfy/ldm/flux/layers.py
+++ b/comfy/ldm/flux/layers.py
@ -6,7 +6,6 @@ from torch import Tensor, nn

 from .math import attention, rope
 import comfy.ops
-import comfy.ldm.common_dit


 class EmbedND(nn.Module):
@ -64,7 +63,10 @@ class RMSNorm(torch.nn.Module):
        self.scale = nn.Parameter(torch.empty((dim), dtype=dtype, device=device))

    def forward(self, x: Tensor):
-        return comfy.ldm.common_dit.rms_norm(x, self.scale, 1e-6)
+        x_dtype = x.dtype
+        x = x.float()
+        rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
+        return (x * rrms).to(dtype=x_dtype) * comfy.ops.cast_to(self.scale, dtype=x_dtype, device=x.device)


 class QKNorm(torch.nn.Module):
@ -105,9 +107,7 @@ class Modulation(nn.Module):
        self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)

    def forward(self, vec: Tensor) -> tuple:
-        if vec.ndim == 2:
-            vec = vec[:, None, :]
-        out = self.lin(nn.functional.silu(vec)).chunk(self.multiplier, dim=-1)
+        out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)

        return (
            ModulationOut(*out[:3]),
@ -115,22 +115,8 @@ class Modulation(nn.Module):
        )


-def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None):
-    if modulation_dims is None:
-        if m_add is not None:
-            return tensor * m_mult + m_add
-        else:
-            return tensor * m_mult
-    else:
-        for d in modulation_dims:
-            tensor[:, d[0]:d[1]] *= m_mult[:, d[2]]
-            if m_add is not None:
-                tensor[:, d[0]:d[1]] += m_add[:, d[2]]
-        return tensor
-
-
 class DoubleStreamBlock(nn.Module):
-    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
+    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, dtype=None, device=None, operations=None):
        super().__init__()

        mlp_hidden_dim = int(hidden_size * mlp_ratio)
@ -157,53 +143,42 @@ class DoubleStreamBlock(nn.Module):
            nn.GELU(approximate="tanh"),
            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
        )
-        self.flipped_img_txt = flipped_img_txt

-    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None):
+    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor):
        img_mod1, img_mod2 = self.img_mod(vec)
        txt_mod1, txt_mod2 = self.txt_mod(vec)

        # prepare image for attention
        img_modulated = self.img_norm1(img)
-        img_modulated = apply_mod(img_modulated, (1 + img_mod1.scale), img_mod1.shift, modulation_dims_img)
+        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
        img_qkv = self.img_attn.qkv(img_modulated)
        img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

        # prepare txt for attention
        txt_modulated = self.txt_norm1(txt)
-        txt_modulated = apply_mod(txt_modulated, (1 + txt_mod1.scale), txt_mod1.shift, modulation_dims_txt)
+        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
        txt_qkv = self.txt_attn.qkv(txt_modulated)
        txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)

-        if self.flipped_img_txt:
-            # run actual attention
-            attn = attention(torch.cat((img_q, txt_q), dim=2),
-                             torch.cat((img_k, txt_k), dim=2),
-                             torch.cat((img_v, txt_v), dim=2),
-                             pe=pe, mask=attn_mask)
-
-            img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1]:]
-        else:
        # run actual attention
        attn = attention(torch.cat((txt_q, img_q), dim=2),
                         torch.cat((txt_k, img_k), dim=2),
-                             torch.cat((txt_v, img_v), dim=2),
-                             pe=pe, mask=attn_mask)
+                         torch.cat((txt_v, img_v), dim=2), pe=pe)

        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]

        # calculate the img bloks
-        img = img + apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
-        img = img + apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img)
+        img += img_mod1.gate * self.img_attn.proj(img_attn)
+        img += img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)

        # calculate the txt bloks
-        txt += apply_mod(self.txt_attn.proj(txt_attn), txt_mod1.gate, None, modulation_dims_txt)
-        txt += apply_mod(self.txt_mlp(apply_mod(self.txt_norm2(txt), (1 + txt_mod2.scale), txt_mod2.shift, modulation_dims_txt)), txt_mod2.gate, None, modulation_dims_txt)
+        txt += txt_mod1.gate * self.txt_attn.proj(txt_attn)
+        txt += txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)

        if txt.dtype == torch.float16:
-            txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
+            txt = txt.clip(-65504, 65504)

        return img, txt

@ -244,20 +219,21 @@ class SingleStreamBlock(nn.Module):
        self.mlp_act = nn.GELU(approximate="tanh")
        self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)

-    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None) -> Tensor:
+    def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
        mod, _ = self.modulation(vec)
-        qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
+        x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
+        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)

        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k = self.norm(q, k, v)

        # compute attention
-        attn = attention(q, k, v, pe=pe, mask=attn_mask)
+        attn = attention(q, k, v, pe=pe)
        # compute activation in mlp stream, cat again and run second linear layer
        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
-        x += apply_mod(output, mod.gate, None, modulation_dims)
+        x += mod.gate * output
        if x.dtype == torch.float16:
-            x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
+            x = x.clip(-65504, 65504)
        return x


@ -268,11 +244,8 @@ class LastLayer(nn.Module):
        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))

-    def forward(self, x: Tensor, vec: Tensor, modulation_dims=None) -> Tensor:
-        if vec.ndim == 2:
-            vec = vec[:, None, :]
-
-        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=-1)
-        x = apply_mod(self.norm_final(x), (1 + scale), shift, modulation_dims)
+    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
+        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
+        x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
        x = self.linear(x)
        return x
--- a/comfy/ldm/flux/math.py
+++ b/comfy/ldm/flux/math.py
@ -1,29 +1,20 @@
 import torch
 from einops import rearrange
 from torch import Tensor
-
 from comfy.ldm.modules.attention import optimized_attention
 import comfy.model_management

-
-def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor:
-    q_shape = q.shape
-    k_shape = k.shape
-
-    if pe is not None:
-        q = q.to(dtype=pe.dtype).reshape(*q.shape[:-1], -1, 1, 2)
-        k = k.to(dtype=pe.dtype).reshape(*k.shape[:-1], -1, 1, 2)
-        q = (pe[..., 0] * q[..., 0] + pe[..., 1] * q[..., 1]).reshape(*q_shape).type_as(v)
-        k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v)
+def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
+    q, k = apply_rope(q, k, pe)

    heads = q.shape[1]
-    x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask)
+    x = optimized_attention(q, k, v, heads, skip_reshape=True)
    return x


 def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    assert dim % 2 == 0
-    if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu() or comfy.model_management.is_directml_enabled():
+    if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu():
        device = torch.device("cpu")
    else:
        device = pos.device
@ -37,9 +28,8 @@ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:


 def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor):
-    xq_ = xq.to(dtype=freqs_cis.dtype).reshape(*xq.shape[:-1], -1, 1, 2)
-    xk_ = xk.to(dtype=freqs_cis.dtype).reshape(*xk.shape[:-1], -1, 1, 2)
+    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
+    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
-
--- a/comfy/ldm/flux/model.py
+++ b/comfy/ldm/flux/model.py
@ -4,8 +4,6 @@ from dataclasses import dataclass

 import torch
 from torch import Tensor, nn
-from einops import rearrange, repeat
-import comfy.ldm.common_dit

 from .layers import (
    DoubleStreamBlock,
@ -16,10 +14,12 @@ from .layers import (
    timestep_embedding,
 )

+from einops import rearrange, repeat
+import comfy.ldm.common_dit
+
@dataclass
 class FluxParams:
    in_channels: int
-    out_channels: int
    vec_in_dim: int
    context_in_dim: int
    hidden_size: int
@ -29,7 +29,6 @@ class FluxParams:
    depth_single_blocks: int
    axes_dim: list
    theta: int
-    patch_size: int
    qkv_bias: bool
    guidance_embed: bool

@ -39,14 +38,13 @@ class Flux(nn.Module):
    Transformer model for flow matching on sequences.
    """

-    def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
+    def __init__(self, image_model=None, dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        self.dtype = dtype
        params = FluxParams(**kwargs)
        self.params = params
-        self.patch_size = params.patch_size
-        self.in_channels = params.in_channels * params.patch_size * params.patch_size
-        self.out_channels = params.out_channels * params.patch_size * params.patch_size
+        self.in_channels = params.in_channels * 2 * 2
+        self.out_channels = self.in_channels
        if params.hidden_size % params.num_heads != 0:
            raise ValueError(
                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
@ -85,7 +83,6 @@ class Flux(nn.Module):
            ]
        )

-        if final_layer:
        self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations)

    def forward_orig(
@ -97,11 +94,7 @@ class Flux(nn.Module):
        timesteps: Tensor,
        y: Tensor,
        guidance: Tensor = None,
-        control = None,
-        transformer_options={},
-        attn_mask: Tensor = None,
    ) -> Tensor:
-        patches_replace = transformer_options.get("patches_replace", {})
        if img.ndim != 3 or txt.ndim != 3:
            raise ValueError("Input img and txt tensors must have 3 dimensions.")

@ -109,88 +102,30 @@ class Flux(nn.Module):
        img = self.img_in(img)
        vec = self.time_in(timestep_embedding(timesteps, 256).to(img.dtype))
        if self.params.guidance_embed:
-            if guidance is not None:
+            if guidance is None:
+                raise ValueError("Didn't get guidance strength for guidance distilled model.")
            vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))

-        vec = vec + self.vector_in(y[:,:self.params.vec_in_dim])
+        vec = vec + self.vector_in(y)
        txt = self.txt_in(txt)

-        if img_ids is not None:
        ids = torch.cat((txt_ids, img_ids), dim=1)
        pe = self.pe_embedder(ids)
-        else:
-            pe = None

-        blocks_replace = patches_replace.get("dit", {})
-        for i, block in enumerate(self.double_blocks):
-            if ("double_block", i) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"], out["txt"] = block(img=args["img"],
-                                                   txt=args["txt"],
-                                                   vec=args["vec"],
-                                                   pe=args["pe"],
-                                                   attn_mask=args.get("attn_mask"))
-                    return out
-
-                out = blocks_replace[("double_block", i)]({"img": img,
-                                                           "txt": txt,
-                                                           "vec": vec,
-                                                           "pe": pe,
-                                                           "attn_mask": attn_mask},
-                                                          {"original_block": block_wrap})
-                txt = out["txt"]
-                img = out["img"]
-            else:
-                img, txt = block(img=img,
-                                 txt=txt,
-                                 vec=vec,
-                                 pe=pe,
-                                 attn_mask=attn_mask)
-
-            if control is not None: # Controlnet
-                control_i = control.get("input")
-                if i < len(control_i):
-                    add = control_i[i]
-                    if add is not None:
-                        img += add
+        for block in self.double_blocks:
+            img, txt = block(img=img, txt=txt, vec=vec, pe=pe)

        img = torch.cat((txt, img), 1)
-
-        for i, block in enumerate(self.single_blocks):
-            if ("single_block", i) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"] = block(args["img"],
-                                       vec=args["vec"],
-                                       pe=args["pe"],
-                                       attn_mask=args.get("attn_mask"))
-                    return out
-
-                out = blocks_replace[("single_block", i)]({"img": img,
-                                                           "vec": vec,
-                                                           "pe": pe,
-                                                           "attn_mask": attn_mask},
-                                                          {"original_block": block_wrap})
-                img = out["img"]
-            else:
-                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)
-
-            if control is not None: # Controlnet
-                control_o = control.get("output")
-                if i < len(control_o):
-                    add = control_o[i]
-                    if add is not None:
-                        img[:, txt.shape[1] :, ...] += add
-
+        for block in self.single_blocks:
+            img = block(img, vec=vec, pe=pe)
        img = img[:, txt.shape[1] :, ...]

        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
        return img

-    def forward(self, x, timestep, context, y, guidance=None, control=None, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, y, guidance, **kwargs):
        bs, c, h, w = x.shape
-        patch_size = self.patch_size
+        patch_size = 2
        x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))

        img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
@ -198,10 +133,10 @@ class Flux(nn.Module):
        h_len = ((h + (patch_size // 2)) // patch_size)
        w_len = ((w + (patch_size // 2)) // patch_size)
        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
-        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
-        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
+        img_ids[..., 1] = img_ids[..., 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype)[:, None]
+        img_ids[..., 2] = img_ids[..., 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype)[None, :]
        img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)

        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
-        out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
+        out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance)
        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h,:w]
--- a/comfy/ldm/flux/redux.py
+++ b/comfy/ldm/flux/redux.py
@ -1,25 +0,0 @@
-import torch
-import comfy.ops
-
-ops = comfy.ops.manual_cast
-
-class ReduxImageEncoder(torch.nn.Module):
-    def __init__(
-        self,
-        redux_dim: int = 1152,
-        txt_in_features: int = 4096,
-        device=None,
-        dtype=None,
-    ) -> None:
-        super().__init__()
-
-        self.redux_dim = redux_dim
-        self.device = device
-        self.dtype = dtype
-
-        self.redux_up = ops.Linear(redux_dim, txt_in_features * 3, dtype=dtype)
-        self.redux_down = ops.Linear(txt_in_features * 3, txt_in_features, dtype=dtype)
-
-    def forward(self, sigclip_embeds) -> torch.Tensor:
-        projected_x = self.redux_down(torch.nn.functional.silu(self.redux_up(sigclip_embeds)))
-        return projected_x
--- a/comfy/ldm/genmo/joint_model/asymm_models_joint.py
+++ b/comfy/ldm/genmo/joint_model/asymm_models_joint.py
@ -1,557 +0,0 @@
-#original code from https://github.com/genmoai/models under apache 2.0 license
-#adapted to ComfyUI
-
-from typing import Dict, List, Optional, Tuple
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-# from flash_attn import flash_attn_varlen_qkvpacked_func
-from comfy.ldm.modules.attention import optimized_attention
-
-from .layers import (
-    FeedForward,
-    PatchEmbed,
-    RMSNorm,
-    TimestepEmbedder,
-)
-
-from .rope_mixed import (
-    compute_mixed_rotation,
-    create_position_matrix,
-)
-from .temporal_rope import apply_rotary_emb_qk_real
-from .utils import (
-    AttentionPool,
-    modulate,
-)
-
-import comfy.ldm.common_dit
-import comfy.ops
-
-
-def modulated_rmsnorm(x, scale, eps=1e-6):
-    # Normalize and modulate
-    x_normed = comfy.ldm.common_dit.rms_norm(x, eps=eps)
-    x_modulated = x_normed * (1 + scale.unsqueeze(1))
-
-    return x_modulated
-
-
-def residual_tanh_gated_rmsnorm(x, x_res, gate, eps=1e-6):
-    # Apply tanh to gate
-    tanh_gate = torch.tanh(gate).unsqueeze(1)
-
-    # Normalize and apply gated scaling
-    x_normed = comfy.ldm.common_dit.rms_norm(x_res, eps=eps) * tanh_gate
-
-    # Apply residual connection
-    output = x + x_normed
-
-    return output
-
-class AsymmetricAttention(nn.Module):
-    def __init__(
-        self,
-        dim_x: int,
-        dim_y: int,
-        num_heads: int = 8,
-        qkv_bias: bool = True,
-        qk_norm: bool = False,
-        attn_drop: float = 0.0,
-        update_y: bool = True,
-        out_bias: bool = True,
-        attend_to_padding: bool = False,
-        softmax_scale: Optional[float] = None,
-        device: Optional[torch.device] = None,
-        dtype=None,
-        operations=None,
-    ):
-        super().__init__()
-        self.dim_x = dim_x
-        self.dim_y = dim_y
-        self.num_heads = num_heads
-        self.head_dim = dim_x // num_heads
-        self.attn_drop = attn_drop
-        self.update_y = update_y
-        self.attend_to_padding = attend_to_padding
-        self.softmax_scale = softmax_scale
-        if dim_x % num_heads != 0:
-            raise ValueError(
-                f"dim_x={dim_x} should be divisible by num_heads={num_heads}"
-            )
-
-        # Input layers.
-        self.qkv_bias = qkv_bias
-        self.qkv_x = operations.Linear(dim_x, 3 * dim_x, bias=qkv_bias, device=device, dtype=dtype)
-        # Project text features to match visual features (dim_y -> dim_x)
-        self.qkv_y = operations.Linear(dim_y, 3 * dim_x, bias=qkv_bias, device=device, dtype=dtype)
-
-        # Query and key normalization for stability.
-        assert qk_norm
-        self.q_norm_x = RMSNorm(self.head_dim, device=device, dtype=dtype)
-        self.k_norm_x = RMSNorm(self.head_dim, device=device, dtype=dtype)
-        self.q_norm_y = RMSNorm(self.head_dim, device=device, dtype=dtype)
-        self.k_norm_y = RMSNorm(self.head_dim, device=device, dtype=dtype)
-
-        # Output layers. y features go back down from dim_x -> dim_y.
-        self.proj_x = operations.Linear(dim_x, dim_x, bias=out_bias, device=device, dtype=dtype)
-        self.proj_y = (
-            operations.Linear(dim_x, dim_y, bias=out_bias, device=device, dtype=dtype)
-            if update_y
-            else nn.Identity()
-        )
-
-    def forward(
-        self,
-        x: torch.Tensor,  # (B, N, dim_x)
-        y: torch.Tensor,  # (B, L, dim_y)
-        scale_x: torch.Tensor,  # (B, dim_x), modulation for pre-RMSNorm.
-        scale_y: torch.Tensor,  # (B, dim_y), modulation for pre-RMSNorm.
-        crop_y,
-        **rope_rotation,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        rope_cos = rope_rotation.get("rope_cos")
-        rope_sin = rope_rotation.get("rope_sin")
-        # Pre-norm for visual features
-        x = modulated_rmsnorm(x, scale_x)  # (B, M, dim_x) where M = N / cp_group_size
-
-        # Process visual features
-        # qkv_x = self.qkv_x(x)  # (B, M, 3 * dim_x)
-        # assert qkv_x.dtype == torch.bfloat16
-        # qkv_x = all_to_all_collect_tokens(
-        #     qkv_x, self.num_heads
-        # )  # (3, B, N, local_h, head_dim)
-
-        # Process text features
-        y = modulated_rmsnorm(y, scale_y)  # (B, L, dim_y)
-        q_y, k_y, v_y = self.qkv_y(y).view(y.shape[0], y.shape[1], 3, self.num_heads, -1).unbind(2)  # (B, N, local_h, head_dim)
-
-        q_y = self.q_norm_y(q_y)
-        k_y = self.k_norm_y(k_y)
-
-        # Split qkv_x into q, k, v
-        q_x, k_x, v_x = self.qkv_x(x).view(x.shape[0], x.shape[1], 3, self.num_heads, -1).unbind(2)  # (B, N, local_h, head_dim)
-        q_x = self.q_norm_x(q_x)
-        q_x = apply_rotary_emb_qk_real(q_x, rope_cos, rope_sin)
-        k_x = self.k_norm_x(k_x)
-        k_x = apply_rotary_emb_qk_real(k_x, rope_cos, rope_sin)
-
-        q = torch.cat([q_x, q_y[:, :crop_y]], dim=1).transpose(1, 2)
-        k = torch.cat([k_x, k_y[:, :crop_y]], dim=1).transpose(1, 2)
-        v = torch.cat([v_x, v_y[:, :crop_y]], dim=1).transpose(1, 2)
-
-        xy = optimized_attention(q,
-                                 k,
-                                 v, self.num_heads, skip_reshape=True)
-
-        x, y = torch.tensor_split(xy, (q_x.shape[1],), dim=1)
-        x = self.proj_x(x)
-        o = torch.zeros(y.shape[0], q_y.shape[1], y.shape[-1], device=y.device, dtype=y.dtype)
-        o[:, :y.shape[1]] = y
-
-        y = self.proj_y(o)
-        # print("ox", x)
-        # print("oy", y)
-        return x, y
-
-
-class AsymmetricJointBlock(nn.Module):
-    def __init__(
-        self,
-        hidden_size_x: int,
-        hidden_size_y: int,
-        num_heads: int,
-        *,
-        mlp_ratio_x: float = 8.0,  # Ratio of hidden size to d_model for MLP for visual tokens.
-        mlp_ratio_y: float = 4.0,  # Ratio of hidden size to d_model for MLP for text tokens.
-        update_y: bool = True,  # Whether to update text tokens in this block.
-        device: Optional[torch.device] = None,
-        dtype=None,
-        operations=None,
-        **block_kwargs,
-    ):
-        super().__init__()
-        self.update_y = update_y
-        self.hidden_size_x = hidden_size_x
-        self.hidden_size_y = hidden_size_y
-        self.mod_x = operations.Linear(hidden_size_x, 4 * hidden_size_x, device=device, dtype=dtype)
-        if self.update_y:
-            self.mod_y = operations.Linear(hidden_size_x, 4 * hidden_size_y, device=device, dtype=dtype)
-        else:
-            self.mod_y = operations.Linear(hidden_size_x, hidden_size_y, device=device, dtype=dtype)
-
-        # Self-attention:
-        self.attn = AsymmetricAttention(
-            hidden_size_x,
-            hidden_size_y,
-            num_heads=num_heads,
-            update_y=update_y,
-            device=device,
-            dtype=dtype,
-            operations=operations,
-            **block_kwargs,
-        )
-
-        # MLP.
-        mlp_hidden_dim_x = int(hidden_size_x * mlp_ratio_x)
-        assert mlp_hidden_dim_x == int(1536 * 8)
-        self.mlp_x = FeedForward(
-            in_features=hidden_size_x,
-            hidden_size=mlp_hidden_dim_x,
-            multiple_of=256,
-            ffn_dim_multiplier=None,
-            device=device,
-            dtype=dtype,
-            operations=operations,
-        )
-
-        # MLP for text not needed in last block.
-        if self.update_y:
-            mlp_hidden_dim_y = int(hidden_size_y * mlp_ratio_y)
-            self.mlp_y = FeedForward(
-                in_features=hidden_size_y,
-                hidden_size=mlp_hidden_dim_y,
-                multiple_of=256,
-                ffn_dim_multiplier=None,
-                device=device,
-                dtype=dtype,
-                operations=operations,
-            )
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        c: torch.Tensor,
-        y: torch.Tensor,
-        **attn_kwargs,
-    ):
-        """Forward pass of a block.
-
-        Args:
-            x: (B, N, dim) tensor of visual tokens
-            c: (B, dim) tensor of conditioned features
-            y: (B, L, dim) tensor of text tokens
-            num_frames: Number of frames in the video. N = num_frames * num_spatial_tokens
-
-        Returns:
-            x: (B, N, dim) tensor of visual tokens after block
-            y: (B, L, dim) tensor of text tokens after block
-        """
-        N = x.size(1)
-
-        c = F.silu(c)
-        mod_x = self.mod_x(c)
-        scale_msa_x, gate_msa_x, scale_mlp_x, gate_mlp_x = mod_x.chunk(4, dim=1)
-
-        mod_y = self.mod_y(c)
-        if self.update_y:
-            scale_msa_y, gate_msa_y, scale_mlp_y, gate_mlp_y = mod_y.chunk(4, dim=1)
-        else:
-            scale_msa_y = mod_y
-
-        # Self-attention block.
-        x_attn, y_attn = self.attn(
-            x,
-            y,
-            scale_x=scale_msa_x,
-            scale_y=scale_msa_y,
-            **attn_kwargs,
-        )
-
-        assert x_attn.size(1) == N
-        x = residual_tanh_gated_rmsnorm(x, x_attn, gate_msa_x)
-        if self.update_y:
-            y = residual_tanh_gated_rmsnorm(y, y_attn, gate_msa_y)
-
-        # MLP block.
-        x = self.ff_block_x(x, scale_mlp_x, gate_mlp_x)
-        if self.update_y:
-            y = self.ff_block_y(y, scale_mlp_y, gate_mlp_y)
-
-        return x, y
-
-    def ff_block_x(self, x, scale_x, gate_x):
-        x_mod = modulated_rmsnorm(x, scale_x)
-        x_res = self.mlp_x(x_mod)
-        x = residual_tanh_gated_rmsnorm(x, x_res, gate_x)  # Sandwich norm
-        return x
-
-    def ff_block_y(self, y, scale_y, gate_y):
-        y_mod = modulated_rmsnorm(y, scale_y)
-        y_res = self.mlp_y(y_mod)
-        y = residual_tanh_gated_rmsnorm(y, y_res, gate_y)  # Sandwich norm
-        return y
-
-
-class FinalLayer(nn.Module):
-    """
-    The final layer of DiT.
-    """
-
-    def __init__(
-        self,
-        hidden_size,
-        patch_size,
-        out_channels,
-        device: Optional[torch.device] = None,
-        dtype=None,
-        operations=None,
-    ):
-        super().__init__()
-        self.norm_final = operations.LayerNorm(
-            hidden_size, elementwise_affine=False, eps=1e-6, device=device, dtype=dtype
-        )
-        self.mod = operations.Linear(hidden_size, 2 * hidden_size, device=device, dtype=dtype)
-        self.linear = operations.Linear(
-            hidden_size, patch_size * patch_size * out_channels, device=device, dtype=dtype
-        )
-
-    def forward(self, x, c):
-        c = F.silu(c)
-        shift, scale = self.mod(c).chunk(2, dim=1)
-        x = modulate(self.norm_final(x), shift, scale)
-        x = self.linear(x)
-        return x
-
-
-class AsymmDiTJoint(nn.Module):
-    """
-    Diffusion model with a Transformer backbone.
-
-    Ingests text embeddings instead of a label.
-    """
-
-    def __init__(
-        self,
-        *,
-        patch_size=2,
-        in_channels=4,
-        hidden_size_x=1152,
-        hidden_size_y=1152,
-        depth=48,
-        num_heads=16,
-        mlp_ratio_x=8.0,
-        mlp_ratio_y=4.0,
-        use_t5: bool = False,
-        t5_feat_dim: int = 4096,
-        t5_token_length: int = 256,
-        learn_sigma=True,
-        patch_embed_bias: bool = True,
-        timestep_mlp_bias: bool = True,
-        attend_to_padding: bool = False,
-        timestep_scale: Optional[float] = None,
-        use_extended_posenc: bool = False,
-        posenc_preserve_area: bool = False,
-        rope_theta: float = 10000.0,
-        image_model=None,
-        device: Optional[torch.device] = None,
-        dtype=None,
-        operations=None,
-        **block_kwargs,
-    ):
-        super().__init__()
-
-        self.dtype = dtype
-        self.learn_sigma = learn_sigma
-        self.in_channels = in_channels
-        self.out_channels = in_channels * 2 if learn_sigma else in_channels
-        self.patch_size = patch_size
-        self.num_heads = num_heads
-        self.hidden_size_x = hidden_size_x
-        self.hidden_size_y = hidden_size_y
-        self.head_dim = (
-            hidden_size_x // num_heads
-        )  # Head dimension and count is determined by visual.
-        self.attend_to_padding = attend_to_padding
-        self.use_extended_posenc = use_extended_posenc
-        self.posenc_preserve_area = posenc_preserve_area
-        self.use_t5 = use_t5
-        self.t5_token_length = t5_token_length
-        self.t5_feat_dim = t5_feat_dim
-        self.rope_theta = (
-            rope_theta  # Scaling factor for frequency computation for temporal RoPE.
-        )
-
-        self.x_embedder = PatchEmbed(
-            patch_size=patch_size,
-            in_chans=in_channels,
-            embed_dim=hidden_size_x,
-            bias=patch_embed_bias,
-            dtype=dtype,
-            device=device,
-            operations=operations
-        )
-        # Conditionings
-        # Timestep
-        self.t_embedder = TimestepEmbedder(
-            hidden_size_x, bias=timestep_mlp_bias, timestep_scale=timestep_scale, dtype=dtype, device=device, operations=operations
-        )
-
-        if self.use_t5:
-            # Caption Pooling (T5)
-            self.t5_y_embedder = AttentionPool(
-                t5_feat_dim, num_heads=8, output_dim=hidden_size_x, dtype=dtype, device=device, operations=operations
-            )
-
-            # Dense Embedding Projection (T5)
-            self.t5_yproj = operations.Linear(
-                t5_feat_dim, hidden_size_y, bias=True, dtype=dtype, device=device
-            )
-
-        # Initialize pos_frequencies as an empty parameter.
-        self.pos_frequencies = nn.Parameter(
-            torch.empty(3, self.num_heads, self.head_dim // 2, dtype=dtype, device=device)
-        )
-
-        assert not self.attend_to_padding
-
-        # for depth 48:
-        #  b =  0: AsymmetricJointBlock, update_y=True
-        #  b =  1: AsymmetricJointBlock, update_y=True
-        #  ...
-        #  b = 46: AsymmetricJointBlock, update_y=True
-        #  b = 47: AsymmetricJointBlock, update_y=False. No need to update text features.
-        blocks = []
-        for b in range(depth):
-            # Joint multi-modal block
-            update_y = b < depth - 1
-            block = AsymmetricJointBlock(
-                hidden_size_x,
-                hidden_size_y,
-                num_heads,
-                mlp_ratio_x=mlp_ratio_x,
-                mlp_ratio_y=mlp_ratio_y,
-                update_y=update_y,
-                attend_to_padding=attend_to_padding,
-                device=device,
-                dtype=dtype,
-                operations=operations,
-                **block_kwargs,
-            )
-
-            blocks.append(block)
-        self.blocks = nn.ModuleList(blocks)
-
-        self.final_layer = FinalLayer(
-            hidden_size_x, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations
-        )
-
-    def embed_x(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-            x: (B, C=12, T, H, W) tensor of visual tokens
-
-        Returns:
-            x: (B, C=3072, N) tensor of visual tokens with positional embedding.
-        """
-        return self.x_embedder(x)  # Convert BcTHW to BCN
-
-    def prepare(
-        self,
-        x: torch.Tensor,
-        sigma: torch.Tensor,
-        t5_feat: torch.Tensor,
-        t5_mask: torch.Tensor,
-    ):
-        """Prepare input and conditioning embeddings."""
-        # Visual patch embeddings with positional encoding.
-        T, H, W = x.shape[-3:]
-        pH, pW = H // self.patch_size, W // self.patch_size
-        x = self.embed_x(x)  # (B, N, D), where N = T * H * W / patch_size ** 2
-        assert x.ndim == 3
-
-        pH, pW = H // self.patch_size, W // self.patch_size
-        N = T * pH * pW
-        assert x.size(1) == N
-        pos = create_position_matrix(
-            T, pH=pH, pW=pW, device=x.device, dtype=torch.float32
-        )  # (N, 3)
-        rope_cos, rope_sin = compute_mixed_rotation(
-            freqs=comfy.ops.cast_to(self.pos_frequencies, dtype=x.dtype, device=x.device), pos=pos
-        )  # Each are (N, num_heads, dim // 2)
-
-        c_t = self.t_embedder(1 - sigma, out_dtype=x.dtype)  # (B, D)
-
-        t5_y_pool = self.t5_y_embedder(t5_feat, t5_mask)  # (B, D)
-
-        c = c_t + t5_y_pool
-
-        y_feat = self.t5_yproj(t5_feat)  # (B, L, t5_feat_dim) --> (B, L, D)
-
-        return x, c, y_feat, rope_cos, rope_sin
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        timestep: torch.Tensor,
-        context: List[torch.Tensor],
-        attention_mask: List[torch.Tensor],
-        num_tokens=256,
-        packed_indices: Dict[str, torch.Tensor] = None,
-        rope_cos: torch.Tensor = None,
-        rope_sin: torch.Tensor = None,
-        control=None, transformer_options={}, **kwargs
-    ):
-        patches_replace = transformer_options.get("patches_replace", {})
-        y_feat = context
-        y_mask = attention_mask
-        sigma = timestep
-        """Forward pass of DiT.
-
-        Args:
-            x: (B, C, T, H, W) tensor of spatial inputs (images or latent representations of images)
-            sigma: (B,) tensor of noise standard deviations
-            y_feat: List((B, L, y_feat_dim) tensor of caption token features. For SDXL text encoders: L=77, y_feat_dim=2048)
-            y_mask: List((B, L) boolean tensor indicating which tokens are not padding)
-            packed_indices: Dict with keys for Flash Attention. Result of compute_packed_indices.
-        """
-        B, _, T, H, W = x.shape
-
-        x, c, y_feat, rope_cos, rope_sin = self.prepare(
-            x, sigma, y_feat, y_mask
-        )
-        del y_mask
-
-        blocks_replace = patches_replace.get("dit", {})
-        for i, block in enumerate(self.blocks):
-            if ("double_block", i) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"], out["txt"] = block(
-                                                    args["img"],
-                                                    args["vec"],
-                                                    args["txt"],
-                                                    rope_cos=args["rope_cos"],
-                                                    rope_sin=args["rope_sin"],
-                                                    crop_y=args["num_tokens"]
-                                                    )
-                    return out
-                out = blocks_replace[("double_block", i)]({"img": x, "txt": y_feat, "vec": c, "rope_cos": rope_cos, "rope_sin": rope_sin, "num_tokens": num_tokens}, {"original_block": block_wrap})
-                y_feat = out["txt"]
-                x = out["img"]
-            else:
-                x, y_feat = block(
-                    x,
-                    c,
-                    y_feat,
-                    rope_cos=rope_cos,
-                    rope_sin=rope_sin,
-                    crop_y=num_tokens,
-                )  # (B, M, D), (B, L, D)
-        del y_feat  # Final layers don't use dense text features.
-
-        x = self.final_layer(x, c)  # (B, M, patch_size ** 2 * out_channels)
-        x = rearrange(
-            x,
-            "B (T hp wp) (p1 p2 c) -> B c T (hp p1) (wp p2)",
-            T=T,
-            hp=H // self.patch_size,
-            wp=W // self.patch_size,
-            p1=self.patch_size,
-            p2=self.patch_size,
-            c=self.out_channels,
-        )
-
-        return -x
--- a/comfy/ldm/genmo/joint_model/layers.py
+++ b/comfy/ldm/genmo/joint_model/layers.py
@ -1,164 +0,0 @@
-#original code from https://github.com/genmoai/models under apache 2.0 license
-#adapted to ComfyUI
-
-import collections.abc
-import math
-from itertools import repeat
-from typing import Callable, Optional
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-import comfy.ldm.common_dit
-
-
-# From PyTorch internals
-def _ntuple(n):
-    def parse(x):
-        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
-            return tuple(x)
-        return tuple(repeat(x, n))
-
-    return parse
-
-
-to_2tuple = _ntuple(2)
-
-
-class TimestepEmbedder(nn.Module):
-    def __init__(
-        self,
-        hidden_size: int,
-        frequency_embedding_size: int = 256,
-        *,
-        bias: bool = True,
-        timestep_scale: Optional[float] = None,
-        dtype=None,
-        device=None,
-        operations=None,
-    ):
-        super().__init__()
-        self.mlp = nn.Sequential(
-            operations.Linear(frequency_embedding_size, hidden_size, bias=bias, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Linear(hidden_size, hidden_size, bias=bias, dtype=dtype, device=device),
-        )
-        self.frequency_embedding_size = frequency_embedding_size
-        self.timestep_scale = timestep_scale
-
-    @staticmethod
-    def timestep_embedding(t, dim, max_period=10000):
-        half = dim // 2
-        freqs = torch.arange(start=0, end=half, dtype=torch.float32, device=t.device)
-        freqs.mul_(-math.log(max_period) / half).exp_()
-        args = t[:, None].float() * freqs[None]
-        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-        if dim % 2:
-            embedding = torch.cat(
-                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
-            )
-        return embedding
-
-    def forward(self, t, out_dtype):
-        if self.timestep_scale is not None:
-            t = t * self.timestep_scale
-        t_freq = self.timestep_embedding(t, self.frequency_embedding_size).to(dtype=out_dtype)
-        t_emb = self.mlp(t_freq)
-        return t_emb
-
-
-class FeedForward(nn.Module):
-    def __init__(
-        self,
-        in_features: int,
-        hidden_size: int,
-        multiple_of: int,
-        ffn_dim_multiplier: Optional[float],
-        device: Optional[torch.device] = None,
-        dtype=None,
-        operations=None,
-    ):
-        super().__init__()
-        # keep parameter count and computation constant compared to standard FFN
-        hidden_size = int(2 * hidden_size / 3)
-        # custom dim factor multiplier
-        if ffn_dim_multiplier is not None:
-            hidden_size = int(ffn_dim_multiplier * hidden_size)
-        hidden_size = multiple_of * ((hidden_size + multiple_of - 1) // multiple_of)
-
-        self.hidden_dim = hidden_size
-        self.w1 = operations.Linear(in_features, 2 * hidden_size, bias=False, device=device, dtype=dtype)
-        self.w2 = operations.Linear(hidden_size, in_features, bias=False, device=device, dtype=dtype)
-
-    def forward(self, x):
-        x, gate = self.w1(x).chunk(2, dim=-1)
-        x = self.w2(F.silu(x) * gate)
-        return x
-
-
-class PatchEmbed(nn.Module):
-    def __init__(
-        self,
-        patch_size: int = 16,
-        in_chans: int = 3,
-        embed_dim: int = 768,
-        norm_layer: Optional[Callable] = None,
-        flatten: bool = True,
-        bias: bool = True,
-        dynamic_img_pad: bool = False,
-        dtype=None,
-        device=None,
-        operations=None,
-    ):
-        super().__init__()
-        self.patch_size = to_2tuple(patch_size)
-        self.flatten = flatten
-        self.dynamic_img_pad = dynamic_img_pad
-
-        self.proj = operations.Conv2d(
-            in_chans,
-            embed_dim,
-            kernel_size=patch_size,
-            stride=patch_size,
-            bias=bias,
-            device=device,
-            dtype=dtype,
-        )
-        assert norm_layer is None
-        self.norm = (
-            norm_layer(embed_dim, device=device) if norm_layer else nn.Identity()
-        )
-
-    def forward(self, x):
-        B, _C, T, H, W = x.shape
-        if not self.dynamic_img_pad:
-            assert H % self.patch_size[0] == 0, f"Input height ({H}) should be divisible by patch size ({self.patch_size[0]})."
-            assert W % self.patch_size[1] == 0, f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]})."
-        else:
-            pad_h = (self.patch_size[0] - H % self.patch_size[0]) % self.patch_size[0]
-            pad_w = (self.patch_size[1] - W % self.patch_size[1]) % self.patch_size[1]
-            x = F.pad(x, (0, pad_w, 0, pad_h))
-
-        x = rearrange(x, "B C T H W -> (B T) C H W", B=B, T=T)
-        x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size, padding_mode='circular')
-        x = self.proj(x)
-
-        # Flatten temporal and spatial dimensions.
-        if not self.flatten:
-            raise NotImplementedError("Must flatten output.")
-        x = rearrange(x, "(B T) C H W -> B (T H W) C", B=B, T=T)
-
-        x = self.norm(x)
-        return x
-
-
-class RMSNorm(torch.nn.Module):
-    def __init__(self, hidden_size, eps=1e-5, device=None, dtype=None):
-        super().__init__()
-        self.eps = eps
-        self.weight = torch.nn.Parameter(torch.empty(hidden_size, device=device, dtype=dtype))
-        self.register_parameter("bias", None)
-
-    def forward(self, x):
-        return comfy.ldm.common_dit.rms_norm(x, self.weight, self.eps)
--- a/comfy/ldm/genmo/joint_model/rope_mixed.py
+++ b/comfy/ldm/genmo/joint_model/rope_mixed.py
@ -1,88 +0,0 @@
-#original code from https://github.com/genmoai/models under apache 2.0 license
-
-# import functools
-import math
-
-import torch
-
-
-def centers(start: float, stop, num, dtype=None, device=None):
-    """linspace through bin centers.
-
-    Args:
-        start (float): Start of the range.
-        stop (float): End of the range.
-        num (int): Number of points.
-        dtype (torch.dtype): Data type of the points.
-        device (torch.device): Device of the points.
-
-    Returns:
-        centers (Tensor): Centers of the bins. Shape: (num,).
-    """
-    edges = torch.linspace(start, stop, num + 1, dtype=dtype, device=device)
-    return (edges[:-1] + edges[1:]) / 2
-
-
-# @functools.lru_cache(maxsize=1)
-def create_position_matrix(
-    T: int,
-    pH: int,
-    pW: int,
-    device: torch.device,
-    dtype: torch.dtype,
-    *,
-    target_area: float = 36864,
-):
-    """
-    Args:
-        T: int - Temporal dimension
-        pH: int - Height dimension after patchify
-        pW: int - Width dimension after patchify
-
-    Returns:
-        pos: [T * pH * pW, 3] - position matrix
-    """
-    # Create 1D tensors for each dimension
-    t = torch.arange(T, dtype=dtype)
-
-    # Positionally interpolate to area 36864.
-    # (3072x3072 frame with 16x16 patches = 192x192 latents).
-    # This automatically scales rope positions when the resolution changes.
-    # We use a large target area so the model is more sensitive
-    # to changes in the learned pos_frequencies matrix.
-    scale = math.sqrt(target_area / (pW * pH))
-    w = centers(-pW * scale / 2, pW * scale / 2, pW)
-    h = centers(-pH * scale / 2, pH * scale / 2, pH)
-
-    # Use meshgrid to create 3D grids
-    grid_t, grid_h, grid_w = torch.meshgrid(t, h, w, indexing="ij")
-
-    # Stack and reshape the grids.
-    pos = torch.stack([grid_t, grid_h, grid_w], dim=-1)  # [T, pH, pW, 3]
-    pos = pos.view(-1, 3)  # [T * pH * pW, 3]
-    pos = pos.to(dtype=dtype, device=device)
-
-    return pos
-
-
-def compute_mixed_rotation(
-    freqs: torch.Tensor,
-    pos: torch.Tensor,
-):
-    """
-    Project each 3-dim position into per-head, per-head-dim 1D frequencies.
-
-    Args:
-        freqs: [3, num_heads, num_freqs] - learned rotation frequency (for t, row, col) for each head position
-        pos: [N, 3] - position of each token
-        num_heads: int
-
-    Returns:
-        freqs_cos: [N, num_heads, num_freqs] - cosine components
-        freqs_sin: [N, num_heads, num_freqs] - sine components
-    """
-    assert freqs.ndim == 3
-    freqs_sum = torch.einsum("Nd,dhf->Nhf", pos.to(freqs), freqs)
-    freqs_cos = torch.cos(freqs_sum)
-    freqs_sin = torch.sin(freqs_sum)
-    return freqs_cos, freqs_sin
--- a/comfy/ldm/genmo/joint_model/temporal_rope.py
+++ b/comfy/ldm/genmo/joint_model/temporal_rope.py
@ -1,34 +0,0 @@
-#original code from https://github.com/genmoai/models under apache 2.0 license
-
-# Based on Llama3 Implementation.
-import torch
-
-
-def apply_rotary_emb_qk_real(
-    xqk: torch.Tensor,
-    freqs_cos: torch.Tensor,
-    freqs_sin: torch.Tensor,
-) -> torch.Tensor:
-    """
-    Apply rotary embeddings to input tensors using the given frequency tensor without complex numbers.
-
-    Args:
-        xqk (torch.Tensor): Query and/or Key tensors to apply rotary embeddings. Shape: (B, S, *, num_heads, D)
-                            Can be either just query or just key, or both stacked along some batch or * dim.
-        freqs_cos (torch.Tensor): Precomputed cosine frequency tensor.
-        freqs_sin (torch.Tensor): Precomputed sine frequency tensor.
-
-    Returns:
-        torch.Tensor: The input tensor with rotary embeddings applied.
-    """
-    # Split the last dimension into even and odd parts
-    xqk_even = xqk[..., 0::2]
-    xqk_odd = xqk[..., 1::2]
-
-    # Apply rotation
-    cos_part = (xqk_even * freqs_cos - xqk_odd * freqs_sin).type_as(xqk)
-    sin_part = (xqk_even * freqs_sin + xqk_odd * freqs_cos).type_as(xqk)
-
-    # Interleave the results back into the original shape
-    out = torch.stack([cos_part, sin_part], dim=-1).flatten(-2)
-    return out
--- a/comfy/ldm/genmo/joint_model/utils.py
+++ b/comfy/ldm/genmo/joint_model/utils.py
@ -1,102 +0,0 @@
-#original code from https://github.com/genmoai/models under apache 2.0 license
-#adapted to ComfyUI
-
-from typing import Optional
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-def modulate(x, shift, scale):
-    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
-
-
-def pool_tokens(x: torch.Tensor, mask: torch.Tensor, *, keepdim=False) -> torch.Tensor:
-    """
-    Pool tokens in x using mask.
-
-    NOTE: We assume x does not require gradients.
-
-    Args:
-        x: (B, L, D) tensor of tokens.
-        mask: (B, L) boolean tensor indicating which tokens are not padding.
-
-    Returns:
-        pooled: (B, D) tensor of pooled tokens.
-    """
-    assert x.size(1) == mask.size(1)  # Expected mask to have same length as tokens.
-    assert x.size(0) == mask.size(0)  # Expected mask to have same batch size as tokens.
-    mask = mask[:, :, None].to(dtype=x.dtype)
-    mask = mask / mask.sum(dim=1, keepdim=True).clamp(min=1)
-    pooled = (x * mask).sum(dim=1, keepdim=keepdim)
-    return pooled
-
-
-class AttentionPool(nn.Module):
-    def __init__(
-        self,
-        embed_dim: int,
-        num_heads: int,
-        output_dim: int = None,
-        device: Optional[torch.device] = None,
-        dtype=None,
-        operations=None,
-    ):
-        """
-        Args:
-            spatial_dim (int): Number of tokens in sequence length.
-            embed_dim (int): Dimensionality of input tokens.
-            num_heads (int): Number of attention heads.
-            output_dim (int): Dimensionality of output tokens. Defaults to embed_dim.
-        """
-        super().__init__()
-        self.num_heads = num_heads
-        self.to_kv = operations.Linear(embed_dim, 2 * embed_dim, device=device, dtype=dtype)
-        self.to_q = operations.Linear(embed_dim, embed_dim, device=device, dtype=dtype)
-        self.to_out = operations.Linear(embed_dim, output_dim or embed_dim, device=device, dtype=dtype)
-
-    def forward(self, x, mask):
-        """
-        Args:
-            x (torch.Tensor): (B, L, D) tensor of input tokens.
-            mask (torch.Tensor): (B, L) boolean tensor indicating which tokens are not padding.
-
-        NOTE: We assume x does not require gradients.
-
-        Returns:
-            x (torch.Tensor): (B, D) tensor of pooled tokens.
-        """
-        D = x.size(2)
-
-        # Construct attention mask, shape: (B, 1, num_queries=1, num_keys=1+L).
-        attn_mask = mask[:, None, None, :].bool()  # (B, 1, 1, L).
-        attn_mask = F.pad(attn_mask, (1, 0), value=True)  # (B, 1, 1, 1+L).
-
-        # Average non-padding token features. These will be used as the query.
-        x_pool = pool_tokens(x, mask, keepdim=True)  # (B, 1, D)
-
-        # Concat pooled features to input sequence.
-        x = torch.cat([x_pool, x], dim=1)  # (B, L+1, D)
-
-        # Compute queries, keys, values. Only the mean token is used to create a query.
-        kv = self.to_kv(x)  # (B, L+1, 2 * D)
-        q = self.to_q(x[:, 0])  # (B, D)
-
-        # Extract heads.
-        head_dim = D // self.num_heads
-        kv = kv.unflatten(2, (2, self.num_heads, head_dim))  # (B, 1+L, 2, H, head_dim)
-        kv = kv.transpose(1, 3)  # (B, H, 2, 1+L, head_dim)
-        k, v = kv.unbind(2)  # (B, H, 1+L, head_dim)
-        q = q.unflatten(1, (self.num_heads, head_dim))  # (B, H, head_dim)
-        q = q.unsqueeze(2)  # (B, H, 1, head_dim)
-
-        # Compute attention.
-        x = F.scaled_dot_product_attention(
-            q, k, v, attn_mask=attn_mask, dropout_p=0.0
-        )  # (B, H, 1, head_dim)
-
-        # Concatenate heads and run output.
-        x = x.squeeze(2).flatten(1, 2)  # (B, D = H * head_dim)
-        x = self.to_out(x)
-        return x
--- a/comfy/ldm/genmo/vae/model.py
+++ b/comfy/ldm/genmo/vae/model.py
@ -1,711 +0,0 @@
-#original code from https://github.com/genmoai/models under apache 2.0 license
-#adapted to ComfyUI
-
-from typing import List, Optional, Tuple, Union
-from functools import partial
-import math
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-
-from comfy.ldm.modules.attention import optimized_attention
-
-import comfy.ops
-ops = comfy.ops.disable_weight_init
-
-# import mochi_preview.dit.joint_model.context_parallel as cp
-# from mochi_preview.vae.cp_conv import cp_pass_frames, gather_all_frames
-
-
-def cast_tuple(t, length=1):
-    return t if isinstance(t, tuple) else ((t,) * length)
-
-
-class GroupNormSpatial(ops.GroupNorm):
-    """
-    GroupNorm applied per-frame.
-    """
-
-    def forward(self, x: torch.Tensor, *, chunk_size: int = 8):
-        B, C, T, H, W = x.shape
-        x = rearrange(x, "B C T H W -> (B T) C H W")
-        # Run group norm in chunks.
-        output = torch.empty_like(x)
-        for b in range(0, B * T, chunk_size):
-            output[b : b + chunk_size] = super().forward(x[b : b + chunk_size])
-        return rearrange(output, "(B T) C H W -> B C T H W", B=B, T=T)
-
-class PConv3d(ops.Conv3d):
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        kernel_size: Union[int, Tuple[int, int, int]],
-        stride: Union[int, Tuple[int, int, int]],
-        causal: bool = True,
-        context_parallel: bool = True,
-        **kwargs,
-    ):
-        self.causal = causal
-        self.context_parallel = context_parallel
-        kernel_size = cast_tuple(kernel_size, 3)
-        stride = cast_tuple(stride, 3)
-        height_pad = (kernel_size[1] - 1) // 2
-        width_pad = (kernel_size[2] - 1) // 2
-
-        super().__init__(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=kernel_size,
-            stride=stride,
-            dilation=(1, 1, 1),
-            padding=(0, height_pad, width_pad),
-            **kwargs,
-        )
-
-    def forward(self, x: torch.Tensor):
-        # Compute padding amounts.
-        context_size = self.kernel_size[0] - 1
-        if self.causal:
-            pad_front = context_size
-            pad_back = 0
-        else:
-            pad_front = context_size // 2
-            pad_back = context_size - pad_front
-
-        # Apply padding.
-        assert self.padding_mode == "replicate"  # DEBUG
-        mode = "constant" if self.padding_mode == "zeros" else self.padding_mode
-        x = F.pad(x, (0, 0, 0, 0, pad_front, pad_back), mode=mode)
-        return super().forward(x)
-
-
-class Conv1x1(ops.Linear):
-    """*1x1 Conv implemented with a linear layer."""
-
-    def __init__(self, in_features: int, out_features: int, *args, **kwargs):
-        super().__init__(in_features, out_features, *args, **kwargs)
-
-    def forward(self, x: torch.Tensor):
-        """Forward pass.
-
-        Args:
-            x: Input tensor. Shape: [B, C, *] or [B, *, C].
-
-        Returns:
-            x: Output tensor. Shape: [B, C', *] or [B, *, C'].
-        """
-        x = x.movedim(1, -1)
-        x = super().forward(x)
-        x = x.movedim(-1, 1)
-        return x
-
-
-class DepthToSpaceTime(nn.Module):
-    def __init__(
-        self,
-        temporal_expansion: int,
-        spatial_expansion: int,
-    ):
-        super().__init__()
-        self.temporal_expansion = temporal_expansion
-        self.spatial_expansion = spatial_expansion
-
-    # When printed, this module should show the temporal and spatial expansion factors.
-    def extra_repr(self):
-        return f"texp={self.temporal_expansion}, sexp={self.spatial_expansion}"
-
-    def forward(self, x: torch.Tensor):
-        """Forward pass.
-
-        Args:
-            x: Input tensor. Shape: [B, C, T, H, W].
-
-        Returns:
-            x: Rearranged tensor. Shape: [B, C/(st*s*s), T*st, H*s, W*s].
-        """
-        x = rearrange(
-            x,
-            "B (C st sh sw) T H W -> B C (T st) (H sh) (W sw)",
-            st=self.temporal_expansion,
-            sh=self.spatial_expansion,
-            sw=self.spatial_expansion,
-        )
-
-        # cp_rank, _ = cp.get_cp_rank_size()
-        if self.temporal_expansion > 1: # and cp_rank == 0:
-            # Drop the first self.temporal_expansion - 1 frames.
-            # This is because we always want the 3x3x3 conv filter to only apply
-            # to the first frame, and the first frame doesn't need to be repeated.
-            assert all(x.shape)
-            x = x[:, :, self.temporal_expansion - 1 :]
-            assert all(x.shape)
-
-        return x
-
-
-def norm_fn(
-    in_channels: int,
-    affine: bool = True,
-):
-    return GroupNormSpatial(affine=affine, num_groups=32, num_channels=in_channels)
-
-
-class ResBlock(nn.Module):
-    """Residual block that preserves the spatial dimensions."""
-
-    def __init__(
-        self,
-        channels: int,
-        *,
-        affine: bool = True,
-        attn_block: Optional[nn.Module] = None,
-        causal: bool = True,
-        prune_bottleneck: bool = False,
-        padding_mode: str,
-        bias: bool = True,
-    ):
-        super().__init__()
-        self.channels = channels
-
-        assert causal
-        self.stack = nn.Sequential(
-            norm_fn(channels, affine=affine),
-            nn.SiLU(inplace=True),
-            PConv3d(
-                in_channels=channels,
-                out_channels=channels // 2 if prune_bottleneck else channels,
-                kernel_size=(3, 3, 3),
-                stride=(1, 1, 1),
-                padding_mode=padding_mode,
-                bias=bias,
-                causal=causal,
-            ),
-            norm_fn(channels, affine=affine),
-            nn.SiLU(inplace=True),
-            PConv3d(
-                in_channels=channels // 2 if prune_bottleneck else channels,
-                out_channels=channels,
-                kernel_size=(3, 3, 3),
-                stride=(1, 1, 1),
-                padding_mode=padding_mode,
-                bias=bias,
-                causal=causal,
-            ),
-        )
-
-        self.attn_block = attn_block if attn_block else nn.Identity()
-
-    def forward(self, x: torch.Tensor):
-        """Forward pass.
-
-        Args:
-            x: Input tensor. Shape: [B, C, T, H, W].
-        """
-        residual = x
-        x = self.stack(x)
-        x = x + residual
-        del residual
-
-        return self.attn_block(x)
-
-
-class Attention(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        head_dim: int = 32,
-        qkv_bias: bool = False,
-        out_bias: bool = True,
-        qk_norm: bool = True,
-    ) -> None:
-        super().__init__()
-        self.head_dim = head_dim
-        self.num_heads = dim // head_dim
-        self.qk_norm = qk_norm
-
-        self.qkv = nn.Linear(dim, 3 * dim, bias=qkv_bias)
-        self.out = nn.Linear(dim, dim, bias=out_bias)
-
-    def forward(
-        self,
-        x: torch.Tensor,
-    ) -> torch.Tensor:
-        """Compute temporal self-attention.
-
-        Args:
-            x: Input tensor. Shape: [B, C, T, H, W].
-            chunk_size: Chunk size for large tensors.
-
-        Returns:
-            x: Output tensor. Shape: [B, C, T, H, W].
-        """
-        B, _, T, H, W = x.shape
-
-        if T == 1:
-            # No attention for single frame.
-            x = x.movedim(1, -1)  # [B, C, T, H, W] -> [B, T, H, W, C]
-            qkv = self.qkv(x)
-            _, _, x = qkv.chunk(3, dim=-1)  # Throw away queries and keys.
-            x = self.out(x)
-            return x.movedim(-1, 1)  # [B, T, H, W, C] -> [B, C, T, H, W]
-
-        # 1D temporal attention.
-        x = rearrange(x, "B C t h w -> (B h w) t C")
-        qkv = self.qkv(x)
-
-        # Input: qkv with shape [B, t, 3 * num_heads * head_dim]
-        # Output: x with shape [B, num_heads, t, head_dim]
-        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, self.head_dim).transpose(1, 3).unbind(2)
-
-        if self.qk_norm:
-            q = F.normalize(q, p=2, dim=-1)
-            k = F.normalize(k, p=2, dim=-1)
-
-        x = optimized_attention(q, k, v, self.num_heads, skip_reshape=True)
-
-        assert x.size(0) == q.size(0)
-
-        x = self.out(x)
-        x = rearrange(x, "(B h w) t C -> B C t h w", B=B, h=H, w=W)
-        return x
-
-
-class AttentionBlock(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        **attn_kwargs,
-    ) -> None:
-        super().__init__()
-        self.norm = norm_fn(dim)
-        self.attn = Attention(dim, **attn_kwargs)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return x + self.attn(self.norm(x))
-
-
-class CausalUpsampleBlock(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        num_res_blocks: int,
-        *,
-        temporal_expansion: int = 2,
-        spatial_expansion: int = 2,
-        **block_kwargs,
-    ):
-        super().__init__()
-
-        blocks = []
-        for _ in range(num_res_blocks):
-            blocks.append(block_fn(in_channels, **block_kwargs))
-        self.blocks = nn.Sequential(*blocks)
-
-        self.temporal_expansion = temporal_expansion
-        self.spatial_expansion = spatial_expansion
-
-        # Change channels in the final convolution layer.
-        self.proj = Conv1x1(
-            in_channels,
-            out_channels * temporal_expansion * (spatial_expansion**2),
-        )
-
-        self.d2st = DepthToSpaceTime(
-            temporal_expansion=temporal_expansion, spatial_expansion=spatial_expansion
-        )
-
-    def forward(self, x):
-        x = self.blocks(x)
-        x = self.proj(x)
-        x = self.d2st(x)
-        return x
-
-
-def block_fn(channels, *, affine: bool = True, has_attention: bool = False, **block_kwargs):
-    attn_block = AttentionBlock(channels) if has_attention else None
-    return ResBlock(channels, affine=affine, attn_block=attn_block, **block_kwargs)
-
-
-class DownsampleBlock(nn.Module):
-    def __init__(
-        self,
-        in_channels: int,
-        out_channels: int,
-        num_res_blocks,
-        *,
-        temporal_reduction=2,
-        spatial_reduction=2,
-        **block_kwargs,
-    ):
-        """
-        Downsample block for the VAE encoder.
-
-        Args:
-            in_channels: Number of input channels.
-            out_channels: Number of output channels.
-            num_res_blocks: Number of residual blocks.
-            temporal_reduction: Temporal reduction factor.
-            spatial_reduction: Spatial reduction factor.
-        """
-        super().__init__()
-        layers = []
-
-        # Change the channel count in the strided convolution.
-        # This lets the ResBlock have uniform channel count,
-        # as in ConvNeXt.
-        assert in_channels != out_channels
-        layers.append(
-            PConv3d(
-                in_channels=in_channels,
-                out_channels=out_channels,
-                kernel_size=(temporal_reduction, spatial_reduction, spatial_reduction),
-                stride=(temporal_reduction, spatial_reduction, spatial_reduction),
-                # First layer in each block always uses replicate padding
-                padding_mode="replicate",
-                bias=block_kwargs["bias"],
-            )
-        )
-
-        for _ in range(num_res_blocks):
-            layers.append(block_fn(out_channels, **block_kwargs))
-
-        self.layers = nn.Sequential(*layers)
-
-    def forward(self, x):
-        return self.layers(x)
-
-
-def add_fourier_features(inputs: torch.Tensor, start=6, stop=8, step=1):
-    num_freqs = (stop - start) // step
-    assert inputs.ndim == 5
-    C = inputs.size(1)
-
-    # Create Base 2 Fourier features.
-    freqs = torch.arange(start, stop, step, dtype=inputs.dtype, device=inputs.device)
-    assert num_freqs == len(freqs)
-    w = torch.pow(2.0, freqs) * (2 * torch.pi)  # [num_freqs]
-    C = inputs.shape[1]
-    w = w.repeat(C)[None, :, None, None, None]  # [1, C * num_freqs, 1, 1, 1]
-
-    # Interleaved repeat of input channels to match w.
-    h = inputs.repeat_interleave(num_freqs, dim=1)  # [B, C * num_freqs, T, H, W]
-    # Scale channels by frequency.
-    h = w * h
-
-    return torch.cat(
-        [
-            inputs,
-            torch.sin(h),
-            torch.cos(h),
-        ],
-        dim=1,
-    )
-
-
-class FourierFeatures(nn.Module):
-    def __init__(self, start: int = 6, stop: int = 8, step: int = 1):
-        super().__init__()
-        self.start = start
-        self.stop = stop
-        self.step = step
-
-    def forward(self, inputs):
-        """Add Fourier features to inputs.
-
-        Args:
-            inputs: Input tensor. Shape: [B, C, T, H, W]
-
-        Returns:
-            h: Output tensor. Shape: [B, (1 + 2 * num_freqs) * C, T, H, W]
-        """
-        return add_fourier_features(inputs, self.start, self.stop, self.step)
-
-
-class Decoder(nn.Module):
-    def __init__(
-        self,
-        *,
-        out_channels: int = 3,
-        latent_dim: int,
-        base_channels: int,
-        channel_multipliers: List[int],
-        num_res_blocks: List[int],
-        temporal_expansions: Optional[List[int]] = None,
-        spatial_expansions: Optional[List[int]] = None,
-        has_attention: List[bool],
-        output_norm: bool = True,
-        nonlinearity: str = "silu",
-        output_nonlinearity: str = "silu",
-        causal: bool = True,
-        **block_kwargs,
-    ):
-        super().__init__()
-        self.input_channels = latent_dim
-        self.base_channels = base_channels
-        self.channel_multipliers = channel_multipliers
-        self.num_res_blocks = num_res_blocks
-        self.output_nonlinearity = output_nonlinearity
-        assert nonlinearity == "silu"
-        assert causal
-
-        ch = [mult * base_channels for mult in channel_multipliers]
-        self.num_up_blocks = len(ch) - 1
-        assert len(num_res_blocks) == self.num_up_blocks + 2
-
-        blocks = []
-
-        first_block = [
-            ops.Conv3d(latent_dim, ch[-1], kernel_size=(1, 1, 1))
-        ]  # Input layer.
-        # First set of blocks preserve channel count.
-        for _ in range(num_res_blocks[-1]):
-            first_block.append(
-                block_fn(
-                    ch[-1],
-                    has_attention=has_attention[-1],
-                    causal=causal,
-                    **block_kwargs,
-                )
-            )
-        blocks.append(nn.Sequential(*first_block))
-
-        assert len(temporal_expansions) == len(spatial_expansions) == self.num_up_blocks
-        assert len(num_res_blocks) == len(has_attention) == self.num_up_blocks + 2
-
-        upsample_block_fn = CausalUpsampleBlock
-
-        for i in range(self.num_up_blocks):
-            block = upsample_block_fn(
-                ch[-i - 1],
-                ch[-i - 2],
-                num_res_blocks=num_res_blocks[-i - 2],
-                has_attention=has_attention[-i - 2],
-                temporal_expansion=temporal_expansions[-i - 1],
-                spatial_expansion=spatial_expansions[-i - 1],
-                causal=causal,
-                **block_kwargs,
-            )
-            blocks.append(block)
-
-        assert not output_norm
-
-        # Last block. Preserve channel count.
-        last_block = []
-        for _ in range(num_res_blocks[0]):
-            last_block.append(
-                block_fn(
-                    ch[0], has_attention=has_attention[0], causal=causal, **block_kwargs
-                )
-            )
-        blocks.append(nn.Sequential(*last_block))
-
-        self.blocks = nn.ModuleList(blocks)
-        self.output_proj = Conv1x1(ch[0], out_channels)
-
-    def forward(self, x):
-        """Forward pass.
-
-        Args:
-            x: Latent tensor. Shape: [B, input_channels, t, h, w]. Scaled [-1, 1].
-
-        Returns:
-            x: Reconstructed video tensor. Shape: [B, C, T, H, W]. Scaled to [-1, 1].
-               T + 1 = (t - 1) * 4.
-               H = h * 16, W = w * 16.
-        """
-        for block in self.blocks:
-            x = block(x)
-
-        if self.output_nonlinearity == "silu":
-            x = F.silu(x, inplace=not self.training)
-        else:
-            assert (
-                not self.output_nonlinearity
-            )  # StyleGAN3 omits the to-RGB nonlinearity.
-
-        return self.output_proj(x).contiguous()
-
-class LatentDistribution:
-    def __init__(self, mean: torch.Tensor, logvar: torch.Tensor):
-        """Initialize latent distribution.
-
-        Args:
-            mean: Mean of the distribution. Shape: [B, C, T, H, W].
-            logvar: Logarithm of variance of the distribution. Shape: [B, C, T, H, W].
-        """
-        assert mean.shape == logvar.shape
-        self.mean = mean
-        self.logvar = logvar
-
-    def sample(self, temperature=1.0, generator: torch.Generator = None, noise=None):
-        if temperature == 0.0:
-            return self.mean
-
-        if noise is None:
-            noise = torch.randn(self.mean.shape, device=self.mean.device, dtype=self.mean.dtype, generator=generator)
-        else:
-            assert noise.device == self.mean.device
-            noise = noise.to(self.mean.dtype)
-
-        if temperature != 1.0:
-            raise NotImplementedError(f"Temperature {temperature} is not supported.")
-
-        # Just Gaussian sample with no scaling of variance.
-        return noise * torch.exp(self.logvar * 0.5) + self.mean
-
-    def mode(self):
-        return self.mean
-
-class Encoder(nn.Module):
-    def __init__(
-        self,
-        *,
-        in_channels: int,
-        base_channels: int,
-        channel_multipliers: List[int],
-        num_res_blocks: List[int],
-        latent_dim: int,
-        temporal_reductions: List[int],
-        spatial_reductions: List[int],
-        prune_bottlenecks: List[bool],
-        has_attentions: List[bool],
-        affine: bool = True,
-        bias: bool = True,
-        input_is_conv_1x1: bool = False,
-        padding_mode: str,
-    ):
-        super().__init__()
-        self.temporal_reductions = temporal_reductions
-        self.spatial_reductions = spatial_reductions
-        self.base_channels = base_channels
-        self.channel_multipliers = channel_multipliers
-        self.num_res_blocks = num_res_blocks
-        self.latent_dim = latent_dim
-
-        self.fourier_features = FourierFeatures()
-        ch = [mult * base_channels for mult in channel_multipliers]
-        num_down_blocks = len(ch) - 1
-        assert len(num_res_blocks) == num_down_blocks + 2
-
-        layers = (
-            [ops.Conv3d(in_channels, ch[0], kernel_size=(1, 1, 1), bias=True)]
-            if not input_is_conv_1x1
-            else [Conv1x1(in_channels, ch[0])]
-        )
-
-        assert len(prune_bottlenecks) == num_down_blocks + 2
-        assert len(has_attentions) == num_down_blocks + 2
-        block = partial(block_fn, padding_mode=padding_mode, affine=affine, bias=bias)
-
-        for _ in range(num_res_blocks[0]):
-            layers.append(block(ch[0], has_attention=has_attentions[0], prune_bottleneck=prune_bottlenecks[0]))
-        prune_bottlenecks = prune_bottlenecks[1:]
-        has_attentions = has_attentions[1:]
-
-        assert len(temporal_reductions) == len(spatial_reductions) == len(ch) - 1
-        for i in range(num_down_blocks):
-            layer = DownsampleBlock(
-                ch[i],
-                ch[i + 1],
-                num_res_blocks=num_res_blocks[i + 1],
-                temporal_reduction=temporal_reductions[i],
-                spatial_reduction=spatial_reductions[i],
-                prune_bottleneck=prune_bottlenecks[i],
-                has_attention=has_attentions[i],
-                affine=affine,
-                bias=bias,
-                padding_mode=padding_mode,
-            )
-
-            layers.append(layer)
-
-        # Additional blocks.
-        for _ in range(num_res_blocks[-1]):
-            layers.append(block(ch[-1], has_attention=has_attentions[-1], prune_bottleneck=prune_bottlenecks[-1]))
-
-        self.layers = nn.Sequential(*layers)
-
-        # Output layers.
-        self.output_norm = norm_fn(ch[-1])
-        self.output_proj = Conv1x1(ch[-1], 2 * latent_dim, bias=False)
-
-    @property
-    def temporal_downsample(self):
-        return math.prod(self.temporal_reductions)
-
-    @property
-    def spatial_downsample(self):
-        return math.prod(self.spatial_reductions)
-
-    def forward(self, x) -> LatentDistribution:
-        """Forward pass.
-
-        Args:
-            x: Input video tensor. Shape: [B, C, T, H, W]. Scaled to [-1, 1]
-
-        Returns:
-            means: Latent tensor. Shape: [B, latent_dim, t, h, w]. Scaled [-1, 1].
-                   h = H // 8, w = W // 8, t - 1 = (T - 1) // 6
-            logvar: Shape: [B, latent_dim, t, h, w].
-        """
-        assert x.ndim == 5, f"Expected 5D input, got {x.shape}"
-        x = self.fourier_features(x)
-
-        x = self.layers(x)
-
-        x = self.output_norm(x)
-        x = F.silu(x, inplace=True)
-        x = self.output_proj(x)
-
-        means, logvar = torch.chunk(x, 2, dim=1)
-
-        assert means.ndim == 5
-        assert logvar.shape == means.shape
-        assert means.size(1) == self.latent_dim
-
-        return LatentDistribution(means, logvar)
-
-
-class VideoVAE(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.encoder = Encoder(
-            in_channels=15,
-            base_channels=64,
-            channel_multipliers=[1, 2, 4, 6],
-            num_res_blocks=[3, 3, 4, 6, 3],
-            latent_dim=12,
-            temporal_reductions=[1, 2, 3],
-            spatial_reductions=[2, 2, 2],
-            prune_bottlenecks=[False, False, False, False, False],
-            has_attentions=[False, True, True, True, True],
-            affine=True,
-            bias=True,
-            input_is_conv_1x1=True,
-            padding_mode="replicate"
-        )
-        self.decoder = Decoder(
-            out_channels=3,
-            base_channels=128,
-            channel_multipliers=[1, 2, 4, 6],
-            temporal_expansions=[1, 2, 3],
-            spatial_expansions=[2, 2, 2],
-            num_res_blocks=[3, 3, 4, 6, 3],
-            latent_dim=12,
-            has_attention=[False, False, False, False, False],
-            padding_mode="replicate",
-            output_norm=False,
-            nonlinearity="silu",
-            output_nonlinearity="silu",
-            causal=True,
-        )
-
-    def encode(self, x):
-        return self.encoder(x).mode()
-
-    def decode(self, x):
-        return self.decoder(x)
--- a/comfy/ldm/hidream/model.py
+++ b/comfy/ldm/hidream/model.py
@ -1,799 +0,0 @@
-from typing import Optional, Tuple, List
-
-import torch
-import torch.nn as nn
-import einops
-from einops import repeat
-
-from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps
-import torch.nn.functional as F
-
-from comfy.ldm.flux.math import apply_rope, rope
-from comfy.ldm.flux.layers import LastLayer
-
-from comfy.ldm.modules.attention import optimized_attention
-import comfy.model_management
-import comfy.ldm.common_dit
-
-
-# Copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py
-class EmbedND(nn.Module):
-    def __init__(self, theta: int, axes_dim: List[int]):
-        super().__init__()
-        self.theta = theta
-        self.axes_dim = axes_dim
-
-    def forward(self, ids: torch.Tensor) -> torch.Tensor:
-        n_axes = ids.shape[-1]
-        emb = torch.cat(
-            [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
-            dim=-3,
-        )
-        return emb.unsqueeze(2)
-
-
-class PatchEmbed(nn.Module):
-    def __init__(
-        self,
-        patch_size=2,
-        in_channels=4,
-        out_channels=1024,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        self.patch_size = patch_size
-        self.out_channels = out_channels
-        self.proj = operations.Linear(in_channels * patch_size * patch_size, out_channels, bias=True, dtype=dtype, device=device)
-
-    def forward(self, latent):
-        latent = self.proj(latent)
-        return latent
-
-
-class PooledEmbed(nn.Module):
-    def __init__(self, text_emb_dim, hidden_size, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.pooled_embedder = TimestepEmbedding(in_channels=text_emb_dim, time_embed_dim=hidden_size, dtype=dtype, device=device, operations=operations)
-
-    def forward(self, pooled_embed):
-        return self.pooled_embedder(pooled_embed)
-
-
-class TimestepEmbed(nn.Module):
-    def __init__(self, hidden_size, frequency_embedding_size=256, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.time_proj = Timesteps(num_channels=frequency_embedding_size, flip_sin_to_cos=True, downscale_freq_shift=0)
-        self.timestep_embedder = TimestepEmbedding(in_channels=frequency_embedding_size, time_embed_dim=hidden_size, dtype=dtype, device=device, operations=operations)
-
-    def forward(self, timesteps, wdtype):
-        t_emb = self.time_proj(timesteps).to(dtype=wdtype)
-        t_emb = self.timestep_embedder(t_emb)
-        return t_emb
-
-
-def attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor):
-    return optimized_attention(query.view(query.shape[0], -1, query.shape[-1] * query.shape[-2]), key.view(key.shape[0], -1, key.shape[-1] * key.shape[-2]), value.view(value.shape[0], -1, value.shape[-1] * value.shape[-2]), query.shape[2])
-
-
-class HiDreamAttnProcessor_flashattn:
-    """Attention processor used typically in processing the SD3-like self-attention projections."""
-
-    def __call__(
-        self,
-        attn,
-        image_tokens: torch.FloatTensor,
-        image_tokens_masks: Optional[torch.FloatTensor] = None,
-        text_tokens: Optional[torch.FloatTensor] = None,
-        rope: torch.FloatTensor = None,
-        *args,
-        **kwargs,
-    ) -> torch.FloatTensor:
-        dtype = image_tokens.dtype
-        batch_size = image_tokens.shape[0]
-
-        query_i = attn.q_rms_norm(attn.to_q(image_tokens)).to(dtype=dtype)
-        key_i = attn.k_rms_norm(attn.to_k(image_tokens)).to(dtype=dtype)
-        value_i = attn.to_v(image_tokens)
-
-        inner_dim = key_i.shape[-1]
-        head_dim = inner_dim // attn.heads
-
-        query_i = query_i.view(batch_size, -1, attn.heads, head_dim)
-        key_i = key_i.view(batch_size, -1, attn.heads, head_dim)
-        value_i = value_i.view(batch_size, -1, attn.heads, head_dim)
-        if image_tokens_masks is not None:
-            key_i = key_i * image_tokens_masks.view(batch_size, -1, 1, 1)
-
-        if not attn.single:
-            query_t = attn.q_rms_norm_t(attn.to_q_t(text_tokens)).to(dtype=dtype)
-            key_t = attn.k_rms_norm_t(attn.to_k_t(text_tokens)).to(dtype=dtype)
-            value_t = attn.to_v_t(text_tokens)
-
-            query_t = query_t.view(batch_size, -1, attn.heads, head_dim)
-            key_t = key_t.view(batch_size, -1, attn.heads, head_dim)
-            value_t = value_t.view(batch_size, -1, attn.heads, head_dim)
-
-            num_image_tokens = query_i.shape[1]
-            num_text_tokens = query_t.shape[1]
-            query = torch.cat([query_i, query_t], dim=1)
-            key = torch.cat([key_i, key_t], dim=1)
-            value = torch.cat([value_i, value_t], dim=1)
-        else:
-            query = query_i
-            key = key_i
-            value = value_i
-
-        if query.shape[-1] == rope.shape[-3] * 2:
-            query, key = apply_rope(query, key, rope)
-        else:
-            query_1, query_2 = query.chunk(2, dim=-1)
-            key_1, key_2 = key.chunk(2, dim=-1)
-            query_1, key_1 = apply_rope(query_1, key_1, rope)
-            query = torch.cat([query_1, query_2], dim=-1)
-            key = torch.cat([key_1, key_2], dim=-1)
-
-        hidden_states = attention(query, key, value)
-
-        if not attn.single:
-            hidden_states_i, hidden_states_t = torch.split(hidden_states, [num_image_tokens, num_text_tokens], dim=1)
-            hidden_states_i = attn.to_out(hidden_states_i)
-            hidden_states_t = attn.to_out_t(hidden_states_t)
-            return hidden_states_i, hidden_states_t
-        else:
-            hidden_states = attn.to_out(hidden_states)
-            return hidden_states
-
-class HiDreamAttention(nn.Module):
-    def __init__(
-        self,
-        query_dim: int,
-        heads: int = 8,
-        dim_head: int = 64,
-        upcast_attention: bool = False,
-        upcast_softmax: bool = False,
-        scale_qk: bool = True,
-        eps: float = 1e-5,
-        processor = None,
-        out_dim: int = None,
-        single: bool = False,
-        dtype=None, device=None, operations=None
-    ):
-        # super(Attention, self).__init__()
-        super().__init__()
-        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
-        self.query_dim = query_dim
-        self.upcast_attention = upcast_attention
-        self.upcast_softmax = upcast_softmax
-        self.out_dim = out_dim if out_dim is not None else query_dim
-
-        self.scale_qk = scale_qk
-        self.scale = dim_head**-0.5 if self.scale_qk else 1.0
-
-        self.heads = out_dim // dim_head if out_dim is not None else heads
-        self.sliceable_head_dim = heads
-        self.single = single
-
-        linear_cls = operations.Linear
-        self.linear_cls = linear_cls
-        self.to_q = linear_cls(query_dim, self.inner_dim, dtype=dtype, device=device)
-        self.to_k = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
-        self.to_v = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
-        self.to_out = linear_cls(self.inner_dim, self.out_dim, dtype=dtype, device=device)
-        self.q_rms_norm = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
-        self.k_rms_norm = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
-
-        if not single:
-            self.to_q_t = linear_cls(query_dim, self.inner_dim, dtype=dtype, device=device)
-            self.to_k_t = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
-            self.to_v_t = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
-            self.to_out_t = linear_cls(self.inner_dim, self.out_dim, dtype=dtype, device=device)
-            self.q_rms_norm_t = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
-            self.k_rms_norm_t = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
-
-        self.processor = processor
-
-    def forward(
-        self,
-        norm_image_tokens: torch.FloatTensor,
-        image_tokens_masks: torch.FloatTensor = None,
-        norm_text_tokens: torch.FloatTensor = None,
-        rope: torch.FloatTensor = None,
-    ) -> torch.Tensor:
-        return self.processor(
-            self,
-            image_tokens = norm_image_tokens,
-            image_tokens_masks = image_tokens_masks,
-            text_tokens = norm_text_tokens,
-            rope = rope,
-        )
-
-
-class FeedForwardSwiGLU(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        hidden_dim: int,
-        multiple_of: int = 256,
-        ffn_dim_multiplier: Optional[float] = None,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        hidden_dim = int(2 * hidden_dim / 3)
-        # custom dim factor multiplier
-        if ffn_dim_multiplier is not None:
-            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
-        hidden_dim = multiple_of * (
-            (hidden_dim + multiple_of - 1) // multiple_of
-        )
-
-        self.w1 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
-        self.w2 = operations.Linear(hidden_dim, dim, bias=False, dtype=dtype, device=device)
-        self.w3 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
-
-    def forward(self, x):
-        return self.w2(torch.nn.functional.silu(self.w1(x)) * self.w3(x))
-
-
-# Modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
-class MoEGate(nn.Module):
-    def __init__(self, embed_dim, num_routed_experts=4, num_activated_experts=2, aux_loss_alpha=0.01, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.top_k = num_activated_experts
-        self.n_routed_experts = num_routed_experts
-
-        self.scoring_func = 'softmax'
-        self.alpha = aux_loss_alpha
-        self.seq_aux = False
-
-        # topk selection algorithm
-        self.norm_topk_prob = False
-        self.gating_dim = embed_dim
-        self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim), dtype=dtype, device=device))
-        self.reset_parameters()
-
-    def reset_parameters(self) -> None:
-        pass
-        # import torch.nn.init  as init
-        # init.kaiming_uniform_(self.weight, a=math.sqrt(5))
-
-    def forward(self, hidden_states):
-        bsz, seq_len, h = hidden_states.shape
-
-        ### compute gating score
-        hidden_states = hidden_states.view(-1, h)
-        logits = F.linear(hidden_states, comfy.model_management.cast_to(self.weight, dtype=hidden_states.dtype, device=hidden_states.device), None)
-        if self.scoring_func == 'softmax':
-            scores = logits.softmax(dim=-1)
-        else:
-            raise NotImplementedError(f'insupportable scoring function for MoE gating: {self.scoring_func}')
-
-        ### select top-k experts
-        topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
-
-        ### norm gate to sum 1
-        if self.top_k > 1 and self.norm_topk_prob:
-            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
-            topk_weight = topk_weight / denominator
-
-        aux_loss = None
-        return topk_idx, topk_weight, aux_loss
-
-
-# Modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
-class MOEFeedForwardSwiGLU(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        hidden_dim: int,
-        num_routed_experts: int,
-        num_activated_experts: int,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        self.shared_experts = FeedForwardSwiGLU(dim, hidden_dim // 2, dtype=dtype, device=device, operations=operations)
-        self.experts = nn.ModuleList([FeedForwardSwiGLU(dim, hidden_dim, dtype=dtype, device=device, operations=operations) for i in range(num_routed_experts)])
-        self.gate = MoEGate(
-            embed_dim = dim,
-            num_routed_experts = num_routed_experts,
-            num_activated_experts = num_activated_experts,
-            dtype=dtype, device=device, operations=operations
-        )
-        self.num_activated_experts = num_activated_experts
-
-    def forward(self, x):
-        wtype = x.dtype
-        identity = x
-        orig_shape = x.shape
-        topk_idx, topk_weight, aux_loss = self.gate(x)
-        x = x.view(-1, x.shape[-1])
-        flat_topk_idx = topk_idx.view(-1)
-        if True:  # self.training: # TODO: check which branch performs faster
-            x = x.repeat_interleave(self.num_activated_experts, dim=0)
-            y = torch.empty_like(x, dtype=wtype)
-            for i, expert in enumerate(self.experts):
-                y[flat_topk_idx == i] = expert(x[flat_topk_idx == i]).to(dtype=wtype)
-            y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
-            y =  y.view(*orig_shape).to(dtype=wtype)
-            #y = AddAuxiliaryLoss.apply(y, aux_loss)
-        else:
-            y = self.moe_infer(x, flat_topk_idx, topk_weight.view(-1, 1)).view(*orig_shape)
-        y = y + self.shared_experts(identity)
-        return y
-
-    @torch.no_grad()
-    def moe_infer(self, x, flat_expert_indices, flat_expert_weights):
-        expert_cache = torch.zeros_like(x)
-        idxs = flat_expert_indices.argsort()
-        tokens_per_expert = flat_expert_indices.bincount().cpu().numpy().cumsum(0)
-        token_idxs = idxs // self.num_activated_experts
-        for i, end_idx in enumerate(tokens_per_expert):
-            start_idx = 0 if i == 0 else tokens_per_expert[i-1]
-            if start_idx == end_idx:
-                continue
-            expert = self.experts[i]
-            exp_token_idx = token_idxs[start_idx:end_idx]
-            expert_tokens = x[exp_token_idx]
-            expert_out = expert(expert_tokens)
-            expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]])
-
-            # for fp16 and other dtype
-            expert_cache = expert_cache.to(expert_out.dtype)
-            expert_cache.scatter_reduce_(0, exp_token_idx.view(-1, 1).repeat(1, x.shape[-1]), expert_out, reduce='sum')
-        return expert_cache
-
-
-class TextProjection(nn.Module):
-    def __init__(self, in_features, hidden_size, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.linear = operations.Linear(in_features=in_features, out_features=hidden_size, bias=False, dtype=dtype, device=device)
-
-    def forward(self, caption):
-        hidden_states = self.linear(caption)
-        return hidden_states
-
-
-class BlockType:
-    TransformerBlock = 1
-    SingleTransformerBlock = 2
-
-
-class HiDreamImageSingleTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        num_routed_experts: int = 4,
-        num_activated_experts: int = 2,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        self.num_attention_heads = num_attention_heads
-        self.adaLN_modulation = nn.Sequential(
-            nn.SiLU(),
-            operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device)
-        )
-
-        # 1. Attention
-        self.norm1_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        self.attn1 = HiDreamAttention(
-            query_dim=dim,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            processor = HiDreamAttnProcessor_flashattn(),
-            single = True,
-            dtype=dtype, device=device, operations=operations
-        )
-
-        # 3. Feed-forward
-        self.norm3_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        if num_routed_experts > 0:
-            self.ff_i = MOEFeedForwardSwiGLU(
-                dim = dim,
-                hidden_dim = 4 * dim,
-                num_routed_experts = num_routed_experts,
-                num_activated_experts = num_activated_experts,
-                dtype=dtype, device=device, operations=operations
-            )
-        else:
-            self.ff_i = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
-
-    def forward(
-        self,
-        image_tokens: torch.FloatTensor,
-        image_tokens_masks: Optional[torch.FloatTensor] = None,
-        text_tokens: Optional[torch.FloatTensor] = None,
-        adaln_input: Optional[torch.FloatTensor] = None,
-        rope: torch.FloatTensor = None,
-
-    ) -> torch.FloatTensor:
-        wtype = image_tokens.dtype
-        shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i = \
-            self.adaLN_modulation(adaln_input)[:,None].chunk(6, dim=-1)
-
-        # 1. MM-Attention
-        norm_image_tokens = self.norm1_i(image_tokens).to(dtype=wtype)
-        norm_image_tokens = norm_image_tokens * (1 + scale_msa_i) + shift_msa_i
-        attn_output_i = self.attn1(
-            norm_image_tokens,
-            image_tokens_masks,
-            rope = rope,
-        )
-        image_tokens = gate_msa_i * attn_output_i + image_tokens
-
-        # 2. Feed-forward
-        norm_image_tokens = self.norm3_i(image_tokens).to(dtype=wtype)
-        norm_image_tokens = norm_image_tokens * (1 + scale_mlp_i) + shift_mlp_i
-        ff_output_i = gate_mlp_i * self.ff_i(norm_image_tokens.to(dtype=wtype))
-        image_tokens = ff_output_i + image_tokens
-        return image_tokens
-
-
-class HiDreamImageTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        num_routed_experts: int = 4,
-        num_activated_experts: int = 2,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        self.num_attention_heads = num_attention_heads
-        self.adaLN_modulation = nn.Sequential(
-            nn.SiLU(),
-            operations.Linear(dim, 12 * dim, bias=True, dtype=dtype, device=device)
-        )
-        # nn.init.zeros_(self.adaLN_modulation[1].weight)
-        # nn.init.zeros_(self.adaLN_modulation[1].bias)
-
-        # 1. Attention
-        self.norm1_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        self.norm1_t = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        self.attn1 = HiDreamAttention(
-            query_dim=dim,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            processor = HiDreamAttnProcessor_flashattn(),
-            single = False,
-            dtype=dtype, device=device, operations=operations
-        )
-
-        # 3. Feed-forward
-        self.norm3_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
-        if num_routed_experts > 0:
-            self.ff_i = MOEFeedForwardSwiGLU(
-                dim = dim,
-                hidden_dim = 4 * dim,
-                num_routed_experts = num_routed_experts,
-                num_activated_experts = num_activated_experts,
-                dtype=dtype, device=device, operations=operations
-            )
-        else:
-            self.ff_i = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
-        self.norm3_t = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False)
-        self.ff_t = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
-
-    def forward(
-        self,
-        image_tokens: torch.FloatTensor,
-        image_tokens_masks: Optional[torch.FloatTensor] = None,
-        text_tokens: Optional[torch.FloatTensor] = None,
-        adaln_input: Optional[torch.FloatTensor] = None,
-        rope: torch.FloatTensor = None,
-    ) -> torch.FloatTensor:
-        wtype = image_tokens.dtype
-        shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i, \
-        shift_msa_t, scale_msa_t, gate_msa_t, shift_mlp_t, scale_mlp_t, gate_mlp_t = \
-            self.adaLN_modulation(adaln_input)[:,None].chunk(12, dim=-1)
-
-        # 1. MM-Attention
-        norm_image_tokens = self.norm1_i(image_tokens).to(dtype=wtype)
-        norm_image_tokens = norm_image_tokens * (1 + scale_msa_i) + shift_msa_i
-        norm_text_tokens = self.norm1_t(text_tokens).to(dtype=wtype)
-        norm_text_tokens = norm_text_tokens * (1 + scale_msa_t) + shift_msa_t
-
-        attn_output_i, attn_output_t = self.attn1(
-            norm_image_tokens,
-            image_tokens_masks,
-            norm_text_tokens,
-            rope = rope,
-        )
-
-        image_tokens = gate_msa_i * attn_output_i + image_tokens
-        text_tokens = gate_msa_t * attn_output_t + text_tokens
-
-        # 2. Feed-forward
-        norm_image_tokens = self.norm3_i(image_tokens).to(dtype=wtype)
-        norm_image_tokens = norm_image_tokens * (1 + scale_mlp_i) + shift_mlp_i
-        norm_text_tokens = self.norm3_t(text_tokens).to(dtype=wtype)
-        norm_text_tokens = norm_text_tokens * (1 + scale_mlp_t) + shift_mlp_t
-
-        ff_output_i = gate_mlp_i * self.ff_i(norm_image_tokens)
-        ff_output_t = gate_mlp_t * self.ff_t(norm_text_tokens)
-        image_tokens = ff_output_i + image_tokens
-        text_tokens = ff_output_t + text_tokens
-        return image_tokens, text_tokens
-
-
-class HiDreamImageBlock(nn.Module):
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        num_routed_experts: int = 4,
-        num_activated_experts: int = 2,
-        block_type: BlockType = BlockType.TransformerBlock,
-        dtype=None, device=None, operations=None
-    ):
-        super().__init__()
-        block_classes = {
-            BlockType.TransformerBlock: HiDreamImageTransformerBlock,
-            BlockType.SingleTransformerBlock: HiDreamImageSingleTransformerBlock,
-        }
-        self.block = block_classes[block_type](
-            dim,
-            num_attention_heads,
-            attention_head_dim,
-            num_routed_experts,
-            num_activated_experts,
-            dtype=dtype, device=device, operations=operations
-        )
-
-    def forward(
-        self,
-        image_tokens: torch.FloatTensor,
-        image_tokens_masks: Optional[torch.FloatTensor] = None,
-        text_tokens: Optional[torch.FloatTensor] = None,
-        adaln_input: torch.FloatTensor = None,
-        rope: torch.FloatTensor = None,
-    ) -> torch.FloatTensor:
-        return self.block(
-            image_tokens,
-            image_tokens_masks,
-            text_tokens,
-            adaln_input,
-            rope,
-        )
-
-
-class HiDreamImageTransformer2DModel(nn.Module):
-    def __init__(
-        self,
-        patch_size: Optional[int] = None,
-        in_channels: int = 64,
-        out_channels: Optional[int] = None,
-        num_layers: int = 16,
-        num_single_layers: int = 32,
-        attention_head_dim: int = 128,
-        num_attention_heads: int = 20,
-        caption_channels: List[int] = None,
-        text_emb_dim: int = 2048,
-        num_routed_experts: int = 4,
-        num_activated_experts: int = 2,
-        axes_dims_rope: Tuple[int, int] = (32, 32),
-        max_resolution: Tuple[int, int] = (128, 128),
-        llama_layers: List[int] = None,
-        image_model=None,
-        dtype=None, device=None, operations=None
-    ):
-        self.patch_size = patch_size
-        self.num_attention_heads = num_attention_heads
-        self.attention_head_dim = attention_head_dim
-        self.num_layers = num_layers
-        self.num_single_layers = num_single_layers
-
-        self.gradient_checkpointing = False
-
-        super().__init__()
-        self.dtype = dtype
-        self.out_channels = out_channels or in_channels
-        self.inner_dim = self.num_attention_heads * self.attention_head_dim
-        self.llama_layers = llama_layers
-
-        self.t_embedder = TimestepEmbed(self.inner_dim, dtype=dtype, device=device, operations=operations)
-        self.p_embedder = PooledEmbed(text_emb_dim, self.inner_dim, dtype=dtype, device=device, operations=operations)
-        self.x_embedder = PatchEmbed(
-            patch_size = patch_size,
-            in_channels = in_channels,
-            out_channels = self.inner_dim,
-            dtype=dtype, device=device, operations=operations
-        )
-        self.pe_embedder = EmbedND(theta=10000, axes_dim=axes_dims_rope)
-
-        self.double_stream_blocks = nn.ModuleList(
-            [
-                HiDreamImageBlock(
-                    dim = self.inner_dim,
-                    num_attention_heads = self.num_attention_heads,
-                    attention_head_dim = self.attention_head_dim,
-                    num_routed_experts = num_routed_experts,
-                    num_activated_experts = num_activated_experts,
-                    block_type = BlockType.TransformerBlock,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for i in range(self.num_layers)
-            ]
-        )
-
-        self.single_stream_blocks = nn.ModuleList(
-            [
-                HiDreamImageBlock(
-                    dim = self.inner_dim,
-                    num_attention_heads = self.num_attention_heads,
-                    attention_head_dim = self.attention_head_dim,
-                    num_routed_experts = num_routed_experts,
-                    num_activated_experts = num_activated_experts,
-                    block_type = BlockType.SingleTransformerBlock,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for i in range(self.num_single_layers)
-            ]
-        )
-
-        self.final_layer = LastLayer(self.inner_dim, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations)
-
-        caption_channels = [caption_channels[1], ] * (num_layers + num_single_layers) + [caption_channels[0], ]
-        caption_projection = []
-        for caption_channel in caption_channels:
-            caption_projection.append(TextProjection(in_features=caption_channel, hidden_size=self.inner_dim, dtype=dtype, device=device, operations=operations))
-        self.caption_projection = nn.ModuleList(caption_projection)
-        self.max_seq = max_resolution[0] * max_resolution[1] // (patch_size * patch_size)
-
-    def expand_timesteps(self, timesteps, batch_size, device):
-        if not torch.is_tensor(timesteps):
-            is_mps = device.type == "mps"
-            if isinstance(timesteps, float):
-                dtype = torch.float32 if is_mps else torch.float64
-            else:
-                dtype = torch.int32 if is_mps else torch.int64
-            timesteps = torch.tensor([timesteps], dtype=dtype, device=device)
-        elif len(timesteps.shape) == 0:
-            timesteps = timesteps[None].to(device)
-        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-        timesteps = timesteps.expand(batch_size)
-        return timesteps
-
-    def unpatchify(self, x: torch.Tensor, img_sizes: List[Tuple[int, int]]) -> List[torch.Tensor]:
-        x_arr = []
-        for i, img_size in enumerate(img_sizes):
-            pH, pW = img_size
-            x_arr.append(
-                einops.rearrange(x[i, :pH*pW].reshape(1, pH, pW, -1), 'B H W (p1 p2 C) -> B C (H p1) (W p2)',
-                    p1=self.patch_size, p2=self.patch_size)
-            )
-        x = torch.cat(x_arr, dim=0)
-        return x
-
-    def patchify(self, x, max_seq, img_sizes=None):
-        pz2 = self.patch_size * self.patch_size
-        if isinstance(x, torch.Tensor):
-            B = x.shape[0]
-            device = x.device
-            dtype = x.dtype
-        else:
-            B = len(x)
-            device = x[0].device
-            dtype = x[0].dtype
-        x_masks = torch.zeros((B, max_seq), dtype=dtype, device=device)
-
-        if img_sizes is not None:
-            for i, img_size in enumerate(img_sizes):
-                x_masks[i, 0:img_size[0] * img_size[1]] = 1
-            x = einops.rearrange(x, 'B C S p -> B S (p C)', p=pz2)
-        elif isinstance(x, torch.Tensor):
-            pH, pW = x.shape[-2] // self.patch_size, x.shape[-1] // self.patch_size
-            x = einops.rearrange(x, 'B C (H p1) (W p2) -> B (H W) (p1 p2 C)', p1=self.patch_size, p2=self.patch_size)
-            img_sizes = [[pH, pW]] * B
-            x_masks = None
-        else:
-            raise NotImplementedError
-        return x, x_masks, img_sizes
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        t: torch.Tensor,
-        y: Optional[torch.Tensor] = None,
-        context: Optional[torch.Tensor] = None,
-        encoder_hidden_states_llama3=None,
-        control = None,
-        transformer_options = {},
-    ) -> torch.Tensor:
-        bs, c, h, w = x.shape
-        hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
-        timesteps = t
-        pooled_embeds = y
-        T5_encoder_hidden_states = context
-
-        img_sizes = None
-
-        # spatial forward
-        batch_size = hidden_states.shape[0]
-        hidden_states_type = hidden_states.dtype
-
-        # 0. time
-        timesteps = self.expand_timesteps(timesteps, batch_size, hidden_states.device)
-        timesteps = self.t_embedder(timesteps, hidden_states_type)
-        p_embedder = self.p_embedder(pooled_embeds)
-        adaln_input = timesteps + p_embedder
-
-        hidden_states, image_tokens_masks, img_sizes = self.patchify(hidden_states, self.max_seq, img_sizes)
-        if image_tokens_masks is None:
-            pH, pW = img_sizes[0]
-            img_ids = torch.zeros(pH, pW, 3, device=hidden_states.device)
-            img_ids[..., 1] = img_ids[..., 1] + torch.arange(pH, device=hidden_states.device)[:, None]
-            img_ids[..., 2] = img_ids[..., 2] + torch.arange(pW, device=hidden_states.device)[None, :]
-            img_ids = repeat(img_ids, "h w c -> b (h w) c", b=batch_size)
-        hidden_states = self.x_embedder(hidden_states)
-
-        # T5_encoder_hidden_states = encoder_hidden_states[0]
-        encoder_hidden_states = encoder_hidden_states_llama3.movedim(1, 0)
-        encoder_hidden_states = [encoder_hidden_states[k] for k in self.llama_layers]
-
-        if self.caption_projection is not None:
-            new_encoder_hidden_states = []
-            for i, enc_hidden_state in enumerate(encoder_hidden_states):
-                enc_hidden_state = self.caption_projection[i](enc_hidden_state)
-                enc_hidden_state = enc_hidden_state.view(batch_size, -1, hidden_states.shape[-1])
-                new_encoder_hidden_states.append(enc_hidden_state)
-            encoder_hidden_states = new_encoder_hidden_states
-            T5_encoder_hidden_states = self.caption_projection[-1](T5_encoder_hidden_states)
-            T5_encoder_hidden_states = T5_encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
-            encoder_hidden_states.append(T5_encoder_hidden_states)
-
-        txt_ids = torch.zeros(
-            batch_size,
-            encoder_hidden_states[-1].shape[1] + encoder_hidden_states[-2].shape[1] + encoder_hidden_states[0].shape[1],
-            3,
-            device=img_ids.device, dtype=img_ids.dtype
-        )
-        ids = torch.cat((img_ids, txt_ids), dim=1)
-        rope = self.pe_embedder(ids)
-
-        # 2. Blocks
-        block_id = 0
-        initial_encoder_hidden_states = torch.cat([encoder_hidden_states[-1], encoder_hidden_states[-2]], dim=1)
-        initial_encoder_hidden_states_seq_len = initial_encoder_hidden_states.shape[1]
-        for bid, block in enumerate(self.double_stream_blocks):
-            cur_llama31_encoder_hidden_states = encoder_hidden_states[block_id]
-            cur_encoder_hidden_states = torch.cat([initial_encoder_hidden_states, cur_llama31_encoder_hidden_states], dim=1)
-            hidden_states, initial_encoder_hidden_states = block(
-                image_tokens = hidden_states,
-                image_tokens_masks = image_tokens_masks,
-                text_tokens = cur_encoder_hidden_states,
-                adaln_input = adaln_input,
-                rope = rope,
-            )
-            initial_encoder_hidden_states = initial_encoder_hidden_states[:, :initial_encoder_hidden_states_seq_len]
-            block_id += 1
-
-        image_tokens_seq_len = hidden_states.shape[1]
-        hidden_states = torch.cat([hidden_states, initial_encoder_hidden_states], dim=1)
-        hidden_states_seq_len = hidden_states.shape[1]
-        if image_tokens_masks is not None:
-            encoder_attention_mask_ones = torch.ones(
-                (batch_size, initial_encoder_hidden_states.shape[1] + cur_llama31_encoder_hidden_states.shape[1]),
-                device=image_tokens_masks.device, dtype=image_tokens_masks.dtype
-            )
-            image_tokens_masks = torch.cat([image_tokens_masks, encoder_attention_mask_ones], dim=1)
-
-        for bid, block in enumerate(self.single_stream_blocks):
-            cur_llama31_encoder_hidden_states = encoder_hidden_states[block_id]
-            hidden_states = torch.cat([hidden_states, cur_llama31_encoder_hidden_states], dim=1)
-            hidden_states = block(
-                image_tokens=hidden_states,
-                image_tokens_masks=image_tokens_masks,
-                text_tokens=None,
-                adaln_input=adaln_input,
-                rope=rope,
-            )
-            hidden_states = hidden_states[:, :hidden_states_seq_len]
-            block_id += 1
-
-        hidden_states = hidden_states[:, :image_tokens_seq_len, ...]
-        output = self.final_layer(hidden_states, adaln_input)
-        output = self.unpatchify(output, img_sizes)
-        return -output[:, :, :h, :w]
--- a/comfy/ldm/hunyuan3d/model.py
+++ b/comfy/ldm/hunyuan3d/model.py
@ -1,135 +0,0 @@
-import torch
-from torch import nn
-from comfy.ldm.flux.layers import (
-    DoubleStreamBlock,
-    LastLayer,
-    MLPEmbedder,
-    SingleStreamBlock,
-    timestep_embedding,
-)
-
-
-class Hunyuan3Dv2(nn.Module):
-    def __init__(
-        self,
-        in_channels=64,
-        context_in_dim=1536,
-        hidden_size=1024,
-        mlp_ratio=4.0,
-        num_heads=16,
-        depth=16,
-        depth_single_blocks=32,
-        qkv_bias=True,
-        guidance_embed=False,
-        image_model=None,
-        dtype=None,
-        device=None,
-        operations=None
-    ):
-        super().__init__()
-        self.dtype = dtype
-
-        if hidden_size % num_heads != 0:
-            raise ValueError(
-                f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}"
-            )
-
-        self.max_period = 1000  # While reimplementing the model I noticed that they messed up. This 1000 value was meant to be the time_factor but they set the max_period instead
-        self.latent_in = operations.Linear(in_channels, hidden_size, bias=True, dtype=dtype, device=device)
-        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=hidden_size, dtype=dtype, device=device, operations=operations)
-        self.guidance_in = (
-            MLPEmbedder(in_dim=256, hidden_dim=hidden_size, dtype=dtype, device=device, operations=operations) if guidance_embed else None
-        )
-        self.cond_in = operations.Linear(context_in_dim, hidden_size, dtype=dtype, device=device)
-        self.double_blocks = nn.ModuleList(
-            [
-                DoubleStreamBlock(
-                    hidden_size,
-                    num_heads,
-                    mlp_ratio=mlp_ratio,
-                    qkv_bias=qkv_bias,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for _ in range(depth)
-            ]
-        )
-        self.single_blocks = nn.ModuleList(
-            [
-                SingleStreamBlock(
-                    hidden_size,
-                    num_heads,
-                    mlp_ratio=mlp_ratio,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for _ in range(depth_single_blocks)
-            ]
-        )
-        self.final_layer = LastLayer(hidden_size, 1, in_channels, dtype=dtype, device=device, operations=operations)
-
-    def forward(self, x, timestep, context, guidance=None, transformer_options={}, **kwargs):
-        x = x.movedim(-1, -2)
-        timestep = 1.0 - timestep
-        txt = context
-        img = self.latent_in(x)
-
-        vec = self.time_in(timestep_embedding(timestep, 256, self.max_period).to(dtype=img.dtype))
-        if self.guidance_in is not None:
-            if guidance is not None:
-                vec = vec + self.guidance_in(timestep_embedding(guidance, 256, self.max_period).to(img.dtype))
-
-        txt = self.cond_in(txt)
-        pe = None
-        attn_mask = None
-
-        patches_replace = transformer_options.get("patches_replace", {})
-        blocks_replace = patches_replace.get("dit", {})
-        for i, block in enumerate(self.double_blocks):
-            if ("double_block", i) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"], out["txt"] = block(img=args["img"],
-                                                   txt=args["txt"],
-                                                   vec=args["vec"],
-                                                   pe=args["pe"],
-                                                   attn_mask=args.get("attn_mask"))
-                    return out
-
-                out = blocks_replace[("double_block", i)]({"img": img,
-                                                           "txt": txt,
-                                                           "vec": vec,
-                                                           "pe": pe,
-                                                           "attn_mask": attn_mask},
-                                                          {"original_block": block_wrap})
-                txt = out["txt"]
-                img = out["img"]
-            else:
-                img, txt = block(img=img,
-                                 txt=txt,
-                                 vec=vec,
-                                 pe=pe,
-                                 attn_mask=attn_mask)
-
-        img = torch.cat((txt, img), 1)
-
-        for i, block in enumerate(self.single_blocks):
-            if ("single_block", i) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"] = block(args["img"],
-                                       vec=args["vec"],
-                                       pe=args["pe"],
-                                       attn_mask=args.get("attn_mask"))
-                    return out
-
-                out = blocks_replace[("single_block", i)]({"img": img,
-                                                           "vec": vec,
-                                                           "pe": pe,
-                                                           "attn_mask": attn_mask},
-                                                          {"original_block": block_wrap})
-                img = out["img"]
-            else:
-                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask)
-
-        img = img[:, txt.shape[1]:, ...]
-        img = self.final_layer(img, vec)
-        return img.movedim(-2, -1) * (-1.0)
--- a/comfy/ldm/hunyuan3d/vae.py
+++ b/comfy/ldm/hunyuan3d/vae.py
@ -1,587 +0,0 @@
-# Original: https://github.com/Tencent/Hunyuan3D-2/blob/main/hy3dgen/shapegen/models/autoencoders/model.py
-# Since the header on their VAE source file was a bit confusing we asked for permission to use this code from tencent under the GPL license used in ComfyUI.
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-from typing import Union, Tuple, List, Callable, Optional
-
-import numpy as np
-from einops import repeat, rearrange
-from tqdm import tqdm
-import logging
-
-import comfy.ops
-ops = comfy.ops.disable_weight_init
-
-def generate_dense_grid_points(
-    bbox_min: np.ndarray,
-    bbox_max: np.ndarray,
-    octree_resolution: int,
-    indexing: str = "ij",
-):
-    length = bbox_max - bbox_min
-    num_cells = octree_resolution
-
-    x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
-    y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
-    z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
-    [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
-    xyz = np.stack((xs, ys, zs), axis=-1)
-    grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]
-
-    return xyz, grid_size, length
-
-
-class VanillaVolumeDecoder:
-    @torch.no_grad()
-    def __call__(
-        self,
-        latents: torch.FloatTensor,
-        geo_decoder: Callable,
-        bounds: Union[Tuple[float], List[float], float] = 1.01,
-        num_chunks: int = 10000,
-        octree_resolution: int = None,
-        enable_pbar: bool = True,
-        **kwargs,
-    ):
-        device = latents.device
-        dtype = latents.dtype
-        batch_size = latents.shape[0]
-
-        # 1. generate query points
-        if isinstance(bounds, float):
-            bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
-
-        bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6])
-        xyz_samples, grid_size, length = generate_dense_grid_points(
-            bbox_min=bbox_min,
-            bbox_max=bbox_max,
-            octree_resolution=octree_resolution,
-            indexing="ij"
-        )
-        xyz_samples = torch.from_numpy(xyz_samples).to(device, dtype=dtype).contiguous().reshape(-1, 3)
-
-        # 2. latents to 3d volume
-        batch_logits = []
-        for start in tqdm(range(0, xyz_samples.shape[0], num_chunks), desc="Volume Decoding",
-                          disable=not enable_pbar):
-            chunk_queries = xyz_samples[start: start + num_chunks, :]
-            chunk_queries = repeat(chunk_queries, "p c -> b p c", b=batch_size)
-            logits = geo_decoder(queries=chunk_queries, latents=latents)
-            batch_logits.append(logits)
-
-        grid_logits = torch.cat(batch_logits, dim=1)
-        grid_logits = grid_logits.view((batch_size, *grid_size)).float()
-
-        return grid_logits
-
-
-class FourierEmbedder(nn.Module):
-    """The sin/cosine positional embedding. Given an input tensor `x` of shape [n_batch, ..., c_dim], it converts
-    each feature dimension of `x[..., i]` into:
-        [
-            sin(x[..., i]),
-            sin(f_1*x[..., i]),
-            sin(f_2*x[..., i]),
-            ...
-            sin(f_N * x[..., i]),
-            cos(x[..., i]),
-            cos(f_1*x[..., i]),
-            cos(f_2*x[..., i]),
-            ...
-            cos(f_N * x[..., i]),
-            x[..., i]     # only present if include_input is True.
-        ], here f_i is the frequency.
-
-    Denote the space is [0 / num_freqs, 1 / num_freqs, 2 / num_freqs, 3 / num_freqs, ..., (num_freqs - 1) / num_freqs].
-    If logspace is True, then the frequency f_i is [2^(0 / num_freqs), ..., 2^(i / num_freqs), ...];
-    Otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)].
-
-    Args:
-        num_freqs (int): the number of frequencies, default is 6;
-        logspace (bool): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
-            otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1)];
-        input_dim (int): the input dimension, default is 3;
-        include_input (bool): include the input tensor or not, default is True.
-
-    Attributes:
-        frequencies (torch.Tensor): If logspace is True, then the frequency f_i is [..., 2^(i / num_freqs), ...],
-                otherwise, the frequencies are linearly spaced between [1.0, 2^(num_freqs - 1);
-
-        out_dim (int): the embedding size, if include_input is True, it is input_dim * (num_freqs * 2 + 1),
-            otherwise, it is input_dim * num_freqs * 2.
-
-    """
-
-    def __init__(self,
-                 num_freqs: int = 6,
-                 logspace: bool = True,
-                 input_dim: int = 3,
-                 include_input: bool = True,
-                 include_pi: bool = True) -> None:
-
-        """The initialization"""
-
-        super().__init__()
-
-        if logspace:
-            frequencies = 2.0 ** torch.arange(
-                num_freqs,
-                dtype=torch.float32
-            )
-        else:
-            frequencies = torch.linspace(
-                1.0,
-                2.0 ** (num_freqs - 1),
-                num_freqs,
-                dtype=torch.float32
-            )
-
-        if include_pi:
-            frequencies *= torch.pi
-
-        self.register_buffer("frequencies", frequencies, persistent=False)
-        self.include_input = include_input
-        self.num_freqs = num_freqs
-
-        self.out_dim = self.get_dims(input_dim)
-
-    def get_dims(self, input_dim):
-        temp = 1 if self.include_input or self.num_freqs == 0 else 0
-        out_dim = input_dim * (self.num_freqs * 2 + temp)
-
-        return out_dim
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """ Forward process.
-
-        Args:
-            x: tensor of shape [..., dim]
-
-        Returns:
-            embedding: an embedding of `x` of shape [..., dim * (num_freqs * 2 + temp)]
-                where temp is 1 if include_input is True and 0 otherwise.
-        """
-
-        if self.num_freqs > 0:
-            embed = (x[..., None].contiguous() * self.frequencies.to(device=x.device, dtype=x.dtype)).view(*x.shape[:-1], -1)
-            if self.include_input:
-                return torch.cat((x, embed.sin(), embed.cos()), dim=-1)
-            else:
-                return torch.cat((embed.sin(), embed.cos()), dim=-1)
-        else:
-            return x
-
-
-class CrossAttentionProcessor:
-    def __call__(self, attn, q, k, v):
-        out = F.scaled_dot_product_attention(q, k, v)
-        return out
-
-
-class DropPath(nn.Module):
-    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
-    """
-
-    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
-        super(DropPath, self).__init__()
-        self.drop_prob = drop_prob
-        self.scale_by_keep = scale_by_keep
-
-    def forward(self, x):
-        """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
-
-        This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
-        the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
-        See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
-        changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
-        'survival rate' as the argument.
-
-        """
-        if self.drop_prob == 0. or not self.training:
-            return x
-        keep_prob = 1 - self.drop_prob
-        shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
-        random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
-        if keep_prob > 0.0 and self.scale_by_keep:
-            random_tensor.div_(keep_prob)
-        return x * random_tensor
-
-    def extra_repr(self):
-        return f'drop_prob={round(self.drop_prob, 3):0.3f}'
-
-
-class MLP(nn.Module):
-    def __init__(
-        self, *,
-        width: int,
-        expand_ratio: int = 4,
-        output_width: int = None,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.width = width
-        self.c_fc = ops.Linear(width, width * expand_ratio)
-        self.c_proj = ops.Linear(width * expand_ratio, output_width if output_width is not None else width)
-        self.gelu = nn.GELU()
-        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
-
-    def forward(self, x):
-        return self.drop_path(self.c_proj(self.gelu(self.c_fc(x))))
-
-
-class QKVMultiheadCrossAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        heads: int,
-        width=None,
-        qk_norm=False,
-        norm_layer=ops.LayerNorm
-    ):
-        super().__init__()
-        self.heads = heads
-        self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-        self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-
-        self.attn_processor = CrossAttentionProcessor()
-
-    def forward(self, q, kv):
-        _, n_ctx, _ = q.shape
-        bs, n_data, width = kv.shape
-        attn_ch = width // self.heads // 2
-        q = q.view(bs, n_ctx, self.heads, -1)
-        kv = kv.view(bs, n_data, self.heads, -1)
-        k, v = torch.split(kv, attn_ch, dim=-1)
-
-        q = self.q_norm(q)
-        k = self.k_norm(k)
-        q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
-        out = self.attn_processor(self, q, k, v)
-        out = out.transpose(1, 2).reshape(bs, n_ctx, -1)
-        return out
-
-
-class MultiheadCrossAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        heads: int,
-        qkv_bias: bool = True,
-        data_width: Optional[int] = None,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False,
-        kv_cache: bool = False,
-    ):
-        super().__init__()
-        self.width = width
-        self.heads = heads
-        self.data_width = width if data_width is None else data_width
-        self.c_q = ops.Linear(width, width, bias=qkv_bias)
-        self.c_kv = ops.Linear(self.data_width, width * 2, bias=qkv_bias)
-        self.c_proj = ops.Linear(width, width)
-        self.attention = QKVMultiheadCrossAttention(
-            heads=heads,
-            width=width,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-        self.kv_cache = kv_cache
-        self.data = None
-
-    def forward(self, x, data):
-        x = self.c_q(x)
-        if self.kv_cache:
-            if self.data is None:
-                self.data = self.c_kv(data)
-                logging.info('Save kv cache,this should be called only once for one mesh')
-            data = self.data
-        else:
-            data = self.c_kv(data)
-        x = self.attention(x, data)
-        x = self.c_proj(x)
-        return x
-
-
-class ResidualCrossAttentionBlock(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        heads: int,
-        mlp_expand_ratio: int = 4,
-        data_width: Optional[int] = None,
-        qkv_bias: bool = True,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False
-    ):
-        super().__init__()
-
-        if data_width is None:
-            data_width = width
-
-        self.attn = MultiheadCrossAttention(
-            width=width,
-            heads=heads,
-            data_width=data_width,
-            qkv_bias=qkv_bias,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-        self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.ln_2 = norm_layer(data_width, elementwise_affine=True, eps=1e-6)
-        self.ln_3 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.mlp = MLP(width=width, expand_ratio=mlp_expand_ratio)
-
-    def forward(self, x: torch.Tensor, data: torch.Tensor):
-        x = x + self.attn(self.ln_1(x), self.ln_2(data))
-        x = x + self.mlp(self.ln_3(x))
-        return x
-
-
-class QKVMultiheadAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        heads: int,
-        width=None,
-        qk_norm=False,
-        norm_layer=ops.LayerNorm
-    ):
-        super().__init__()
-        self.heads = heads
-        self.q_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-        self.k_norm = norm_layer(width // heads, elementwise_affine=True, eps=1e-6) if qk_norm else nn.Identity()
-
-    def forward(self, qkv):
-        bs, n_ctx, width = qkv.shape
-        attn_ch = width // self.heads // 3
-        qkv = qkv.view(bs, n_ctx, self.heads, -1)
-        q, k, v = torch.split(qkv, attn_ch, dim=-1)
-
-        q = self.q_norm(q)
-        k = self.k_norm(k)
-
-        q, k, v = map(lambda t: rearrange(t, 'b n h d -> b h n d', h=self.heads), (q, k, v))
-        out = F.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(bs, n_ctx, -1)
-        return out
-
-
-class MultiheadAttention(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        heads: int,
-        qkv_bias: bool,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.width = width
-        self.heads = heads
-        self.c_qkv = ops.Linear(width, width * 3, bias=qkv_bias)
-        self.c_proj = ops.Linear(width, width)
-        self.attention = QKVMultiheadAttention(
-            heads=heads,
-            width=width,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm
-        )
-        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
-
-    def forward(self, x):
-        x = self.c_qkv(x)
-        x = self.attention(x)
-        x = self.drop_path(self.c_proj(x))
-        return x
-
-
-class ResidualAttentionBlock(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        heads: int,
-        qkv_bias: bool = True,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0,
-    ):
-        super().__init__()
-        self.attn = MultiheadAttention(
-            width=width,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            norm_layer=norm_layer,
-            qk_norm=qk_norm,
-            drop_path_rate=drop_path_rate
-        )
-        self.ln_1 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-        self.mlp = MLP(width=width, drop_path_rate=drop_path_rate)
-        self.ln_2 = norm_layer(width, elementwise_affine=True, eps=1e-6)
-
-    def forward(self, x: torch.Tensor):
-        x = x + self.attn(self.ln_1(x))
-        x = x + self.mlp(self.ln_2(x))
-        return x
-
-
-class Transformer(nn.Module):
-    def __init__(
-        self,
-        *,
-        width: int,
-        layers: int,
-        heads: int,
-        qkv_bias: bool = True,
-        norm_layer=ops.LayerNorm,
-        qk_norm: bool = False,
-        drop_path_rate: float = 0.0
-    ):
-        super().__init__()
-        self.width = width
-        self.layers = layers
-        self.resblocks = nn.ModuleList(
-            [
-                ResidualAttentionBlock(
-                    width=width,
-                    heads=heads,
-                    qkv_bias=qkv_bias,
-                    norm_layer=norm_layer,
-                    qk_norm=qk_norm,
-                    drop_path_rate=drop_path_rate
-                )
-                for _ in range(layers)
-            ]
-        )
-
-    def forward(self, x: torch.Tensor):
-        for block in self.resblocks:
-            x = block(x)
-        return x
-
-
-class CrossAttentionDecoder(nn.Module):
-
-    def __init__(
-        self,
-        *,
-        out_channels: int,
-        fourier_embedder: FourierEmbedder,
-        width: int,
-        heads: int,
-        mlp_expand_ratio: int = 4,
-        downsample_ratio: int = 1,
-        enable_ln_post: bool = True,
-        qkv_bias: bool = True,
-        qk_norm: bool = False,
-        label_type: str = "binary"
-    ):
-        super().__init__()
-
-        self.enable_ln_post = enable_ln_post
-        self.fourier_embedder = fourier_embedder
-        self.downsample_ratio = downsample_ratio
-        self.query_proj = ops.Linear(self.fourier_embedder.out_dim, width)
-        if self.downsample_ratio != 1:
-            self.latents_proj = ops.Linear(width * downsample_ratio, width)
-        if self.enable_ln_post == False:
-            qk_norm = False
-        self.cross_attn_decoder = ResidualCrossAttentionBlock(
-            width=width,
-            mlp_expand_ratio=mlp_expand_ratio,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm
-        )
-
-        if self.enable_ln_post:
-            self.ln_post = ops.LayerNorm(width)
-        self.output_proj = ops.Linear(width, out_channels)
-        self.label_type = label_type
-        self.count = 0
-
-    def forward(self, queries=None, query_embeddings=None, latents=None):
-        if query_embeddings is None:
-            query_embeddings = self.query_proj(self.fourier_embedder(queries).to(latents.dtype))
-        self.count += query_embeddings.shape[1]
-        if self.downsample_ratio != 1:
-            latents = self.latents_proj(latents)
-        x = self.cross_attn_decoder(query_embeddings, latents)
-        if self.enable_ln_post:
-            x = self.ln_post(x)
-        occ = self.output_proj(x)
-        return occ
-
-
-class ShapeVAE(nn.Module):
-    def __init__(
-        self,
-        *,
-        embed_dim: int,
-        width: int,
-        heads: int,
-        num_decoder_layers: int,
-        geo_decoder_downsample_ratio: int = 1,
-        geo_decoder_mlp_expand_ratio: int = 4,
-        geo_decoder_ln_post: bool = True,
-        num_freqs: int = 8,
-        include_pi: bool = True,
-        qkv_bias: bool = True,
-        qk_norm: bool = False,
-        label_type: str = "binary",
-        drop_path_rate: float = 0.0,
-        scale_factor: float = 1.0,
-    ):
-        super().__init__()
-        self.geo_decoder_ln_post = geo_decoder_ln_post
-
-        self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
-
-        self.post_kl = ops.Linear(embed_dim, width)
-
-        self.transformer = Transformer(
-            width=width,
-            layers=num_decoder_layers,
-            heads=heads,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm,
-            drop_path_rate=drop_path_rate
-        )
-
-        self.geo_decoder = CrossAttentionDecoder(
-            fourier_embedder=self.fourier_embedder,
-            out_channels=1,
-            mlp_expand_ratio=geo_decoder_mlp_expand_ratio,
-            downsample_ratio=geo_decoder_downsample_ratio,
-            enable_ln_post=self.geo_decoder_ln_post,
-            width=width // geo_decoder_downsample_ratio,
-            heads=heads // geo_decoder_downsample_ratio,
-            qkv_bias=qkv_bias,
-            qk_norm=qk_norm,
-            label_type=label_type,
-        )
-
-        self.volume_decoder = VanillaVolumeDecoder()
-        self.scale_factor = scale_factor
-
-    def decode(self, latents, **kwargs):
-        latents = self.post_kl(latents.movedim(-2, -1))
-        latents = self.transformer(latents)
-
-        bounds = kwargs.get("bounds", 1.01)
-        num_chunks = kwargs.get("num_chunks", 8000)
-        octree_resolution = kwargs.get("octree_resolution", 256)
-        enable_pbar = kwargs.get("enable_pbar", True)
-
-        grid_logits = self.volume_decoder(latents, self.geo_decoder, bounds=bounds, num_chunks=num_chunks, octree_resolution=octree_resolution, enable_pbar=enable_pbar)
-        return grid_logits.movedim(-2, -1)
-
-    def encode(self, x):
-        return None
--- a/comfy/ldm/hunyuan_video/model.py
+++ b/comfy/ldm/hunyuan_video/model.py
@ -1,340 +0,0 @@
-#Based on Flux code because of weird hunyuan video code license.
-
-import torch
-import comfy.ldm.flux.layers
-import comfy.ldm.modules.diffusionmodules.mmdit
-from comfy.ldm.modules.attention import optimized_attention
-
-
-from dataclasses import dataclass
-from einops import repeat
-
-from torch import Tensor, nn
-
-from comfy.ldm.flux.layers import (
-    DoubleStreamBlock,
-    EmbedND,
-    LastLayer,
-    MLPEmbedder,
-    SingleStreamBlock,
-    timestep_embedding
-)
-
-import comfy.ldm.common_dit
-
-
-@dataclass
-class HunyuanVideoParams:
-    in_channels: int
-    out_channels: int
-    vec_in_dim: int
-    context_in_dim: int
-    hidden_size: int
-    mlp_ratio: float
-    num_heads: int
-    depth: int
-    depth_single_blocks: int
-    axes_dim: list
-    theta: int
-    patch_size: list
-    qkv_bias: bool
-    guidance_embed: bool
-
-
-class SelfAttentionRef(nn.Module):
-    def __init__(self, dim: int, qkv_bias: bool = False, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
-        self.proj = operations.Linear(dim, dim, dtype=dtype, device=device)
-
-
-class TokenRefinerBlock(nn.Module):
-    def __init__(
-        self,
-        hidden_size,
-        heads,
-        dtype=None,
-        device=None,
-        operations=None
-    ):
-        super().__init__()
-        self.heads = heads
-        mlp_hidden_dim = hidden_size * 4
-
-        self.adaLN_modulation = nn.Sequential(
-            nn.SiLU(),
-            operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device),
-        )
-
-        self.norm1 = operations.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device)
-        self.self_attn = SelfAttentionRef(hidden_size, True, dtype=dtype, device=device, operations=operations)
-
-        self.norm2 = operations.LayerNorm(hidden_size, elementwise_affine=True, eps=1e-6, dtype=dtype, device=device)
-
-        self.mlp = nn.Sequential(
-            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
-            nn.SiLU(),
-            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
-        )
-
-    def forward(self, x, c, mask):
-        mod1, mod2 = self.adaLN_modulation(c).chunk(2, dim=1)
-
-        norm_x = self.norm1(x)
-        qkv = self.self_attn.qkv(norm_x)
-        q, k, v = qkv.reshape(qkv.shape[0], qkv.shape[1], 3, self.heads, -1).permute(2, 0, 3, 1, 4)
-        attn = optimized_attention(q, k, v, self.heads, mask=mask, skip_reshape=True)
-
-        x = x + self.self_attn.proj(attn) * mod1.unsqueeze(1)
-        x = x + self.mlp(self.norm2(x)) * mod2.unsqueeze(1)
-        return x
-
-
-class IndividualTokenRefiner(nn.Module):
-    def __init__(
-        self,
-        hidden_size,
-        heads,
-        num_blocks,
-        dtype=None,
-        device=None,
-        operations=None
-    ):
-        super().__init__()
-        self.blocks = nn.ModuleList(
-            [
-                TokenRefinerBlock(
-                    hidden_size=hidden_size,
-                    heads=heads,
-                    dtype=dtype,
-                    device=device,
-                    operations=operations
-                )
-                for _ in range(num_blocks)
-            ]
-        )
-
-    def forward(self, x, c, mask):
-        m = None
-        if mask is not None:
-            m = mask.view(mask.shape[0], 1, 1, mask.shape[1]).repeat(1, 1, mask.shape[1], 1)
-            m = m + m.transpose(2, 3)
-
-        for block in self.blocks:
-            x = block(x, c, m)
-        return x
-
-
-
-class TokenRefiner(nn.Module):
-    def __init__(
-        self,
-        text_dim,
-        hidden_size,
-        heads,
-        num_blocks,
-        dtype=None,
-        device=None,
-        operations=None
-    ):
-        super().__init__()
-
-        self.input_embedder = operations.Linear(text_dim, hidden_size, bias=True, dtype=dtype, device=device)
-        self.t_embedder = MLPEmbedder(256, hidden_size, dtype=dtype, device=device, operations=operations)
-        self.c_embedder = MLPEmbedder(text_dim, hidden_size, dtype=dtype, device=device, operations=operations)
-        self.individual_token_refiner = IndividualTokenRefiner(hidden_size, heads, num_blocks, dtype=dtype, device=device, operations=operations)
-
-    def forward(
-        self,
-        x,
-        timesteps,
-        mask,
-    ):
-        t = self.t_embedder(timestep_embedding(timesteps, 256, time_factor=1.0).to(x.dtype))
-        # m = mask.float().unsqueeze(-1)
-        # c = (x.float() * m).sum(dim=1) / m.sum(dim=1) #TODO: the following works when the x.shape is the same length as the tokens but might break otherwise
-        c = x.sum(dim=1) / x.shape[1]
-
-        c = t + self.c_embedder(c.to(x.dtype))
-        x = self.input_embedder(x)
-        x = self.individual_token_refiner(x, c, mask)
-        return x
-
-class HunyuanVideo(nn.Module):
-    """
-    Transformer model for flow matching on sequences.
-    """
-
-    def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
-        super().__init__()
-        self.dtype = dtype
-        params = HunyuanVideoParams(**kwargs)
-        self.params = params
-        self.patch_size = params.patch_size
-        self.in_channels = params.in_channels
-        self.out_channels = params.out_channels
-        if params.hidden_size % params.num_heads != 0:
-            raise ValueError(
-                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
-            )
-        pe_dim = params.hidden_size // params.num_heads
-        if sum(params.axes_dim) != pe_dim:
-            raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
-        self.hidden_size = params.hidden_size
-        self.num_heads = params.num_heads
-        self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
-
-        self.img_in = comfy.ldm.modules.diffusionmodules.mmdit.PatchEmbed(None, self.patch_size, self.in_channels, self.hidden_size, conv3d=True, dtype=dtype, device=device, operations=operations)
-        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations)
-        self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
-        self.guidance_in = (
-            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
-        )
-
-        self.txt_in = TokenRefiner(params.context_in_dim, self.hidden_size, self.num_heads, 2, dtype=dtype, device=device, operations=operations)
-
-        self.double_blocks = nn.ModuleList(
-            [
-                DoubleStreamBlock(
-                    self.hidden_size,
-                    self.num_heads,
-                    mlp_ratio=params.mlp_ratio,
-                    qkv_bias=params.qkv_bias,
-                    flipped_img_txt=True,
-                    dtype=dtype, device=device, operations=operations
-                )
-                for _ in range(params.depth)
-            ]
-        )
-
-        self.single_blocks = nn.ModuleList(
-            [
-                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
-                for _ in range(params.depth_single_blocks)
-            ]
-        )
-
-        if final_layer:
-            self.final_layer = LastLayer(self.hidden_size, self.patch_size[-1], self.out_channels, dtype=dtype, device=device, operations=operations)
-
-    def forward_orig(
-        self,
-        img: Tensor,
-        img_ids: Tensor,
-        txt: Tensor,
-        txt_ids: Tensor,
-        txt_mask: Tensor,
-        timesteps: Tensor,
-        y: Tensor,
-        guidance: Tensor = None,
-        guiding_frame_index=None,
-        control=None,
-        transformer_options={},
-    ) -> Tensor:
-        patches_replace = transformer_options.get("patches_replace", {})
-
-        initial_shape = list(img.shape)
-        # running on sequences img
-        img = self.img_in(img)
-        vec = self.time_in(timestep_embedding(timesteps, 256, time_factor=1.0).to(img.dtype))
-
-        if guiding_frame_index is not None:
-            token_replace_vec = self.time_in(timestep_embedding(guiding_frame_index, 256, time_factor=1.0))
-            vec_ = self.vector_in(y[:, :self.params.vec_in_dim])
-            vec = torch.cat([(vec_ + token_replace_vec).unsqueeze(1), (vec_ + vec).unsqueeze(1)], dim=1)
-            frame_tokens = (initial_shape[-1] // self.patch_size[-1]) * (initial_shape[-2] // self.patch_size[-2])
-            modulation_dims = [(0, frame_tokens, 0), (frame_tokens, None, 1)]
-            modulation_dims_txt = [(0, None, 1)]
-        else:
-            vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
-            modulation_dims = None
-            modulation_dims_txt = None
-
-        if self.params.guidance_embed:
-            if guidance is not None:
-                vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))
-
-        if txt_mask is not None and not torch.is_floating_point(txt_mask):
-            txt_mask = (txt_mask - 1).to(img.dtype) * torch.finfo(img.dtype).max
-
-        txt = self.txt_in(txt, timesteps, txt_mask)
-
-        ids = torch.cat((img_ids, txt_ids), dim=1)
-        pe = self.pe_embedder(ids)
-
-        img_len = img.shape[1]
-        if txt_mask is not None:
-            attn_mask_len = img_len + txt.shape[1]
-            attn_mask = torch.zeros((1, 1, attn_mask_len), dtype=img.dtype, device=img.device)
-            attn_mask[:, 0, img_len:] = txt_mask
-        else:
-            attn_mask = None
-
-        blocks_replace = patches_replace.get("dit", {})
-        for i, block in enumerate(self.double_blocks):
-            if ("double_block", i) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"], out["txt"] = block(img=args["img"], txt=args["txt"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims_img=args["modulation_dims_img"], modulation_dims_txt=args["modulation_dims_txt"])
-                    return out
-
-                out = blocks_replace[("double_block", i)]({"img": img, "txt": txt, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims_img': modulation_dims, 'modulation_dims_txt': modulation_dims_txt}, {"original_block": block_wrap})
-                txt = out["txt"]
-                img = out["img"]
-            else:
-                img, txt = block(img=img, txt=txt, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims_img=modulation_dims, modulation_dims_txt=modulation_dims_txt)
-
-            if control is not None: # Controlnet
-                control_i = control.get("input")
-                if i < len(control_i):
-                    add = control_i[i]
-                    if add is not None:
-                        img += add
-
-        img = torch.cat((img, txt), 1)
-
-        for i, block in enumerate(self.single_blocks):
-            if ("single_block", i) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"] = block(args["img"], vec=args["vec"], pe=args["pe"], attn_mask=args["attention_mask"], modulation_dims=args["modulation_dims"])
-                    return out
-
-                out = blocks_replace[("single_block", i)]({"img": img, "vec": vec, "pe": pe, "attention_mask": attn_mask, 'modulation_dims': modulation_dims}, {"original_block": block_wrap})
-                img = out["img"]
-            else:
-                img = block(img, vec=vec, pe=pe, attn_mask=attn_mask, modulation_dims=modulation_dims)
-
-            if control is not None: # Controlnet
-                control_o = control.get("output")
-                if i < len(control_o):
-                    add = control_o[i]
-                    if add is not None:
-                        img[:, : img_len] += add
-
-        img = img[:, : img_len]
-
-        img = self.final_layer(img, vec, modulation_dims=modulation_dims)  # (N, T, patch_size ** 2 * out_channels)
-
-        shape = initial_shape[-3:]
-        for i in range(len(shape)):
-            shape[i] = shape[i] // self.patch_size[i]
-        img = img.reshape([img.shape[0]] + shape + [self.out_channels] + self.patch_size)
-        img = img.permute(0, 4, 1, 5, 2, 6, 3, 7)
-        img = img.reshape(initial_shape[0], self.out_channels, initial_shape[2], initial_shape[3], initial_shape[4])
-        return img
-
-    def forward(self, x, timestep, context, y, guidance=None, attention_mask=None, guiding_frame_index=None, control=None, transformer_options={}, **kwargs):
-        bs, c, t, h, w = x.shape
-        patch_size = self.patch_size
-        t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
-        h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
-        w_len = ((w + (patch_size[2] // 2)) // patch_size[2])
-        img_ids = torch.zeros((t_len, h_len, w_len, 3), device=x.device, dtype=x.dtype)
-        img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(0, t_len - 1, steps=t_len, device=x.device, dtype=x.dtype).reshape(-1, 1, 1)
-        img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).reshape(1, -1, 1)
-        img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).reshape(1, 1, -1)
-        img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs)
-        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
-        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, guidance, guiding_frame_index, control, transformer_options)
-        return out
--- a/comfy/ldm/hydit/attn_layers.py
+++ b/comfy/ldm/hydit/attn_layers.py
@ -47,7 +47,7 @@ def reshape_for_broadcast(freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]], x


 def rotate_half(x):
-    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
+    x_real, x_imag = x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
    return torch.stack([-x_imag, x_real], dim=-1).flatten(3)


@ -78,9 +78,10 @@ def apply_rotary_emb(
    xk_out = None
    if isinstance(freqs_cis, tuple):
        cos, sin = reshape_for_broadcast(freqs_cis, xq, head_first)    # [S, D]
-        xq_out = (xq * cos + rotate_half(xq) * sin)
+        cos, sin = cos.to(xq.device), sin.to(xq.device)
+        xq_out = (xq.float() * cos + rotate_half(xq.float()) * sin).type_as(xq)
        if xk is not None:
-            xk_out = (xk * cos + rotate_half(xk) * sin)
+            xk_out = (xk.float() * cos + rotate_half(xk.float()) * sin).type_as(xk)
    else:
        xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))  # [B, S, H, D//2]
        freqs_cis = reshape_for_broadcast(freqs_cis, xq_, head_first).to(xq.device)   # [S, D//2] --> [1, S, 1, D//2]
--- a/comfy/ldm/hydit/controlnet.py
+++ b/comfy/ldm/hydit/controlnet.py
@ -1,17 +1,24 @@
+from typing import Any, Optional

 import torch
 import torch.nn as nn
+import torch.nn.functional as F

+from torch.utils import checkpoint

 from comfy.ldm.modules.diffusionmodules.mmdit import (
+    Mlp,
    TimestepEmbedder,
    PatchEmbed,
+    RMSNorm,
 )
+from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
 from .poolers import AttentionPool

 import comfy.latent_formats
 from .models import HunYuanDiTBlock, calc_rope

+from .posemb_layers import get_2d_rotary_pos_embed, get_fill_resize_and_crop


 class HunYuanControlNet(nn.Module):
@ -164,6 +171,9 @@ class HunYuanControlNet(nn.Module):
            ),
        )

+        # Image embedding
+        num_patches = self.x_embedder.num_patches
+
        # HUnYuanDiT Blocks
        self.blocks = nn.ModuleList(
            [
--- a/comfy/ldm/hydit/models.py
+++ b/comfy/ldm/hydit/models.py
@ -1,6 +1,8 @@
+from typing import Any

 import torch
 import torch.nn as nn
+import torch.nn.functional as F

 import comfy.ops
 from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, TimestepEmbedder, PatchEmbed, RMSNorm
@ -19,7 +21,6 @@ def calc_rope(x, patch_size, head_size):
    sub_args = [start, stop, (th, tw)]
    # head_size = HUNYUAN_DIT_CONFIG['DiT-g/2']['hidden_size'] // HUNYUAN_DIT_CONFIG['DiT-g/2']['num_heads']
    rope = get_2d_rotary_pos_embed(head_size, *sub_args)
-    rope = (rope[0].to(x), rope[1].to(x))
    return rope


@ -248,6 +249,9 @@ class HunYuanDiT(nn.Module):
            operations.Linear(hidden_size * 4, hidden_size, bias=True, dtype=dtype, device=device),
        )

+        # Image embedding
+        num_patches = self.x_embedder.num_patches
+
        # HUnYuanDiT Blocks
        self.blocks = nn.ModuleList([
            HunYuanDiTBlock(hidden_size=hidden_size,
@ -282,7 +286,7 @@ class HunYuanDiT(nn.Module):
                style=None,
                return_dict=False,
                control=None,
-                transformer_options={},
+                transformer_options=None,
                ):
        """
        Forward pass of the encoder.
@ -310,7 +314,8 @@ class HunYuanDiT(nn.Module):
        return_dict: bool
            Whether to return a dictionary.
        """
-        patches_replace = transformer_options.get("patches_replace", {})
+        #import pdb
+        #pdb.set_trace()
        encoder_hidden_states = context
        text_states = encoder_hidden_states                     # 2,77,1024
        text_states_t5 = encoder_hidden_states_t5               # 2,256,2048
@ -358,8 +363,6 @@ class HunYuanDiT(nn.Module):
        # Concatenate all extra vectors
        c = t + self.extra_embedder(extra_vec)  # [B, D]

-        blocks_replace = patches_replace.get("dit", {})
-
        controls = None
        if control:
            controls = control.get("output", None)
@ -368,23 +371,12 @@ class HunYuanDiT(nn.Module):
        for layer, block in enumerate(self.blocks):
            if layer > self.depth // 2:
                if controls is not None:
-                    skip = skips.pop() + controls.pop().to(dtype=x.dtype)
+                    skip = skips.pop() + controls.pop()
                else:
                    skip = skips.pop()
-            else:
-                skip = None
-
-            if ("double_block", layer) in blocks_replace:
-                def block_wrap(args):
-                    out = {}
-                    out["img"] = block(args["img"], args["vec"], args["txt"], args["pe"], args["skip"])
-                    return out
-
-                out = blocks_replace[("double_block", layer)]({"img": x, "txt": text_states, "vec": c, "pe": freqs_cis_img, "skip": skip}, {"original_block": block_wrap})
-                x = out["img"]
-            else:
                x = block(x, c, text_states, freqs_cis_img, skip)   # (N, L, D)
-
+            else:
+                x = block(x, c, text_states, freqs_cis_img)         # (N, L, D)

            if layer < (self.depth // 2 - 1):
                skips.append(x)
--- a/comfy/ldm/hydit/poolers.py
+++ b/comfy/ldm/hydit/poolers.py
@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from comfy.ldm.modules.attention import optimized_attention
 import comfy.ops

--- a/Show More
+++ b/Show More