from torch import Tensor, nn
from .utils import *
from .sampler import *
"""
Diffusion Classes (generic for 1d data)
"""
class Model1d(nn.Module):
    """Generic diffusion model wrapper for 1d data.

    NOTE(review): this class looks stubbed/truncated. ``diffusion_kwargs``
    is split out of ``kwargs`` but then discarded, and both ``self.unet``
    and ``self.diffusion`` are assigned ``None`` — so ``forward`` and
    ``sample`` will raise ``TypeError`` (``None`` is not callable) until
    ``self.diffusion`` is replaced with a real diffusion module. Confirm
    against the upstream source that constructs the u-net/diffusion here.
    """
    def __init__(self, unet_type: str = "base", **kwargs):
        super().__init__()
        # Partition kwargs: keys prefixed "diffusion_" vs. everything else
        # (presumably u-net kwargs). Both results are currently unused.
        diffusion_kwargs, kwargs = groupby("diffusion_", kwargs)
        self.unet = None
        self.diffusion = None
    def forward(self, x: Tensor, **kwargs) -> Tensor:
        # Delegates the training/loss computation to the diffusion module.
        return self.diffusion(x, **kwargs)
    def sample(self, *args, **kwargs) -> Tensor:
        # Delegates generation to the diffusion module's sampler.
        return self.diffusion.sample(*args, **kwargs)
"""
Audio Diffusion Classes (specific for 1d audio data)
"""
def get_default_model_kwargs():
    """Return the default constructor kwargs shared by the audio models."""
    return {
        "channels": 128,
        "patch_size": 16,
        "multipliers": [1, 2, 4, 4, 4, 4, 4],
        "factors": [4, 4, 4, 2, 2, 2],
        "num_blocks": [2, 2, 2, 2, 2, 2],
        "attentions": [0, 0, 0, 1, 1, 1, 1],
        "attention_heads": 8,
        "attention_features": 64,
        "attention_multiplier": 2,
        "attention_use_rel_pos": False,
        "diffusion_type": "v",
        "diffusion_sigma_distribution": UniformDistribution(),
    }
def get_default_sampling_kwargs():
    """Return the default kwargs forwarded to ``diffusion.sample``."""
    return {
        "sigma_schedule": LinearSchedule(),
        "sampler": VSampler(),
        "clamp": True,
    }
class AudioDiffusionModel(Model1d):
    """Unconditional 1d audio diffusion model with preset defaults."""

    def __init__(self, **kwargs):
        # Caller-supplied kwargs take precedence over the defaults.
        model_kwargs = get_default_model_kwargs()
        model_kwargs.update(kwargs)
        super().__init__(**model_kwargs)

    def sample(self, *args, **kwargs):
        # Same precedence rule for the sampling defaults.
        sampling_kwargs = get_default_sampling_kwargs()
        sampling_kwargs.update(kwargs)
        return super().sample(*args, **sampling_kwargs)
class AudioDiffusionConditional(Model1d):
    """Audio diffusion model conditioned on context embeddings (CFG u-net)."""

    def __init__(
        self,
        embedding_features: int,
        embedding_max_length: int,
        embedding_mask_proba: float = 0.1,
        **kwargs,
    ):
        # Start from the shared defaults, layer on the conditional-specific
        # settings, then let caller-supplied kwargs override everything.
        merged = get_default_model_kwargs()
        merged["unet_type"] = "cfg"
        merged["context_embedding_features"] = embedding_features
        merged["context_embedding_max_length"] = embedding_max_length
        merged.update(kwargs)
        super().__init__(**merged)
        # Probability of masking the conditioning during training
        # (classifier-free guidance).
        self.embedding_mask_proba = embedding_mask_proba

    def forward(self, *args, **kwargs):
        # Inject the stored mask probability unless the caller overrides it.
        kwargs.setdefault("embedding_mask_proba", self.embedding_mask_proba)
        return super().forward(*args, **kwargs)

    def sample(self, *args, **kwargs):
        # Default guidance scale plus the shared sampling defaults; explicit
        # caller kwargs always win.
        kwargs.setdefault("embedding_scale", 5.0)
        for key, value in get_default_sampling_kwargs().items():
            kwargs.setdefault(key, value)
        return super().sample(*args, **kwargs)