Mirror of https://github.com/comfyanonymous/ComfyUI.git
A different way of handling multiple images passed to SVD.
Previously, when a list of 3 images [0, 1, 2] was used for a 6-frame video, they were concatenated like this: [0, 1, 2, 0, 1, 2]; now they are concatenated like this: [0, 0, 1, 1, 2, 2].
parent b2517b4ceb
commit 61a123a1e0
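As an illustration of the change described above, here is a minimal sketch (not part of the commit, plain Python) of which source image each of the 6 output frames picks under the old repeat behaviour versus the new resize behaviour, assuming 3 input images:

import math

in_batch_size = 3   # number of input images
batch_size = 6      # number of video frames

# Old behaviour (repeat_to_batch_size): tile the inputs and truncate.
repeat_indices = [i % in_batch_size for i in range(batch_size)]

# New behaviour (resize_to_batch_size, enlarging branch): nearest-neighbour
# resampling along the batch dimension.
scale = in_batch_size / batch_size
resize_indices = [min(math.floor((i + 0.5) * scale), in_batch_size - 1)
                  for i in range(batch_size)]

print(repeat_indices)  # [0, 1, 2, 0, 1, 2]
print(resize_indices)  # [0, 0, 1, 1, 2, 2]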
@@ -303,7 +303,7 @@ class SVD_img2vid(BaseModel):
         if latent_image.shape[1:] != noise.shape[1:]:
             latent_image = utils.common_upscale(latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")
 
-        latent_image = utils.repeat_to_batch_size(latent_image, noise.shape[0])
+        latent_image = utils.resize_to_batch_size(latent_image, noise.shape[0])
 
         out['c_concat'] = comfy.conds.CONDNoiseShape(latent_image)
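A rough usage sketch of the code path changed in this hunk, assuming ComfyUI with this commit applied is importable; the noise and latent tensors are dummies with made-up shapes:

import torch
import comfy.utils as utils

# Dummy noise for a 6-frame SVD batch and a concat latent built from 3 images
# at half resolution (made-up shapes, for illustration only).
noise = torch.randn(6, 4, 72, 128)
latent_image = torch.randn(3, 4, 36, 64)

# Spatial mismatch -> bilinear upscale to the noise resolution.
if latent_image.shape[1:] != noise.shape[1:]:
    latent_image = utils.common_upscale(latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")

# Batch mismatch -> resample along the batch dimension instead of repeating.
latent_image = utils.resize_to_batch_size(latent_image, noise.shape[0])
print(latent_image.shape)  # torch.Size([6, 4, 72, 128])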
@@ -239,6 +239,26 @@ def repeat_to_batch_size(tensor, batch_size):
         return tensor.repeat([math.ceil(batch_size / tensor.shape[0])] + [1] * (len(tensor.shape) - 1))[:batch_size]
     return tensor
 
+def resize_to_batch_size(tensor, batch_size):
+    in_batch_size = tensor.shape[0]
+    if in_batch_size == batch_size:
+        return tensor
+
+    if batch_size <= 1:
+        return tensor[:batch_size]
+
+    output = torch.empty([batch_size] + list(tensor.shape)[1:], dtype=tensor.dtype, device=tensor.device)
+    if batch_size < in_batch_size:
+        scale = (in_batch_size - 1) / (batch_size - 1)
+        for i in range(batch_size):
+            output[i] = tensor[min(round(i * scale), in_batch_size - 1)]
+    else:
+        scale = in_batch_size / batch_size
+        for i in range(batch_size):
+            output[i] = tensor[min(math.floor((i + 0.5) * scale), in_batch_size - 1)]
+
+    return output
+
 def convert_sd_to(state_dict, dtype):
     keys = list(state_dict.keys())
     for k in keys:
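For reference, a short usage sketch of the new helper on its own (again assuming comfy.utils with this commit is importable); the tensors are dummies whose values encode their batch index so the selection pattern is easy to read off:

import torch
import comfy.utils as utils

# 3 dummy "images", each filled with its own index.
frames = torch.arange(3, dtype=torch.float32).reshape(3, 1, 1, 1).expand(3, 4, 8, 8).contiguous()

# Enlarging the batch from 3 to 6 holds each frame for two outputs.
up = utils.resize_to_batch_size(frames, 6)
print(up[:, 0, 0, 0])    # tensor([0., 0., 1., 1., 2., 2.])

# Shrinking the batch from 3 to 2 keeps the first and last frames.
down = utils.resize_to_batch_size(frames, 2)
print(down[:, 0, 0, 0])  # tensor([0., 2.])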