ComfyUI/comfy/ldm/audio/embedders.py

# code adapted from: https://github.com/Stability-AI/stable-audio-tools

import torch
import torch.nn as nn
from torch import Tensor
from typing import List, Union
from einops import rearrange
import math
import comfy.ops

class LearnedPositionalEmbedding(nn.Module):
    """Used for continuous time"""

    def __init__(self, dim: int):
        super().__init__()
        assert (dim % 2) == 0
        half_dim = dim // 2
        self.weights = nn.Parameter(torch.empty(half_dim))

    def forward(self, x: Tensor) -> Tensor:
        x = rearrange(x, "b -> b 1")
        freqs = x * rearrange(self.weights, "d -> 1 d") * 2 * math.pi
        fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1)
        fouriered = torch.cat((x, fouriered), dim=-1)
        return fouriered

def TimePositionalEmbedding(dim: int, out_features: int) -> nn.Module:
    return nn.Sequential(
        LearnedPositionalEmbedding(dim),
        comfy.ops.manual_cast.Linear(in_features=dim + 1, out_features=out_features),
    )


class NumberEmbedder(nn.Module):
    def __init__(
        self,
        features: int,
        dim: int = 256,
    ):
        super().__init__()
        self.features = features
        self.embedding = TimePositionalEmbedding(dim=dim, out_features=features)

    def forward(self, x: Union[List[float], Tensor]) -> Tensor:
        if not torch.is_tensor(x):
            device = next(self.embedding.parameters()).device
            x = torch.tensor(x, device=device)
        assert isinstance(x, Tensor)
        shape = x.shape
        x = rearrange(x, "... -> (...)")
        embedding = self.embedding(x)
        x = embedding.view(*shape, self.features)
        return x  # type: ignore


class Conditioner(nn.Module):
    def __init__(
            self,
            dim: int,
            output_dim: int,
            project_out: bool = False
            ):

        super().__init__()

        self.dim = dim
        self.output_dim = output_dim
        self.proj_out = nn.Linear(dim, output_dim) if (dim != output_dim or project_out) else nn.Identity()

    def forward(self, x):
        raise NotImplementedError()

class NumberConditioner(Conditioner):
    '''
        Conditioner that takes a list of floats, normalizes them for a given range, and returns a list of embeddings
    '''
    def __init__(self,
                output_dim: int,
                min_val: float=0,
                max_val: float=1
                ):
        super().__init__(output_dim, output_dim)

        self.min_val = min_val
        self.max_val = max_val

        self.embedder = NumberEmbedder(features=output_dim)

    def forward(self, floats, device=None):
            # Cast the inputs to floats
            floats = [float(x) for x in floats]

            if device is None:
                device = next(self.embedder.parameters()).device

            floats = torch.tensor(floats).to(device)

            floats = floats.clamp(self.min_val, self.max_val)

            normalized_floats = (floats - self.min_val) / (self.max_val - self.min_val)

            # Cast floats to same type as embedder
            embedder_dtype = next(self.embedder.parameters()).dtype
            normalized_floats = normalized_floats.to(embedder_dtype)

            float_embeds = self.embedder(normalized_floats).unsqueeze(1)

            return [float_embeds, torch.ones(float_embeds.shape[0], 1).to(device)]
Initial support for the stable audio open model. 2024-06-15 16:14:56 +00:00			`# code adapted from: https://github.com/Stability-AI/stable-audio-tools`

			`import torch`
			`import torch.nn as nn`
Lint unused import (#5973) * Lint unused import * nit * Remove unused imports * revert fix_torch import * nit 2024-12-09 20:24:39 +00:00			`from torch import Tensor`
			`from typing import List, Union`
Initial support for the stable audio open model. 2024-06-15 16:14:56 +00:00			`from einops import rearrange`
			`import math`
			`import comfy.ops`

			`class LearnedPositionalEmbedding(nn.Module):`
			`"""Used for continuous time"""`

			`def __init__(self, dim: int):`
			`super().__init__()`
			`assert (dim % 2) == 0`
			`half_dim = dim // 2`
			`self.weights = nn.Parameter(torch.empty(half_dim))`

			`def forward(self, x: Tensor) -> Tensor:`
			`x = rearrange(x, "b -> b 1")`
			`freqs = x * rearrange(self.weights, "d -> 1 d") * 2 * math.pi`
			`fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1)`
			`fouriered = torch.cat((x, fouriered), dim=-1)`
			`return fouriered`

			`def TimePositionalEmbedding(dim: int, out_features: int) -> nn.Module:`
			`return nn.Sequential(`
			`LearnedPositionalEmbedding(dim),`
			`comfy.ops.manual_cast.Linear(in_features=dim + 1, out_features=out_features),`
			`)`


			`class NumberEmbedder(nn.Module):`
			`def __init__(`
			`self,`
			`features: int,`
			`dim: int = 256,`
			`):`
			`super().__init__()`
			`self.features = features`
			`self.embedding = TimePositionalEmbedding(dim=dim, out_features=features)`

			`def forward(self, x: Union[List[float], Tensor]) -> Tensor:`
			`if not torch.is_tensor(x):`
			`device = next(self.embedding.parameters()).device`
			`x = torch.tensor(x, device=device)`
			`assert isinstance(x, Tensor)`
			`shape = x.shape`
			`x = rearrange(x, "... -> (...)")`
			`embedding = self.embedding(x)`
			`x = embedding.view(*shape, self.features)`
			`return x # type: ignore`


			`class Conditioner(nn.Module):`
			`def __init__(`
			`self,`
			`dim: int,`
			`output_dim: int,`
			`project_out: bool = False`
			`):`

			`super().__init__()`

			`self.dim = dim`
			`self.output_dim = output_dim`
			`self.proj_out = nn.Linear(dim, output_dim) if (dim != output_dim or project_out) else nn.Identity()`

			`def forward(self, x):`
			`raise NotImplementedError()`

			`class NumberConditioner(Conditioner):`
			`'''`
			`Conditioner that takes a list of floats, normalizes them for a given range, and returns a list of embeddings`
			`'''`
			`def __init__(self,`
			`output_dim: int,`
			`min_val: float=0,`
			`max_val: float=1`
			`):`
			`super().__init__(output_dim, output_dim)`

			`self.min_val = min_val`
			`self.max_val = max_val`

			`self.embedder = NumberEmbedder(features=output_dim)`

			`def forward(self, floats, device=None):`
			`# Cast the inputs to floats`
			`floats = [float(x) for x in floats]`

			`if device is None:`
			`device = next(self.embedder.parameters()).device`

			`floats = torch.tensor(floats).to(device)`

			`floats = floats.clamp(self.min_val, self.max_val)`

			`normalized_floats = (floats - self.min_val) / (self.max_val - self.min_val)`

			`# Cast floats to same type as embedder`
			`embedder_dtype = next(self.embedder.parameters()).dtype`
			`normalized_floats = normalized_floats.to(embedder_dtype)`

			`float_embeds = self.embedder(normalized_floats).unsqueeze(1)`

			`return [float_embeds, torch.ones(float_embeds.shape[0], 1).to(device)]`