import math
import os
import random
import glob
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.optim import Adam
from torchvision.utils import make_grid
from PIL import Image
from transformers import (
    DistilBertModel,
    DistilBertTokenizer,
    CLIPTokenizer,
    CLIPTextModel,
)

dataset_params = {
    "image_path": "data/CelebAMask-HQ",
    "image_channels": 3,
    "image_size": 256,
    "name": "celebhq",
}

diffusion_params = {
    "num_timesteps": 1000,
    "beta_start": 0.00085,
    "beta_end": 0.012,
}

ldm_params = {
    "down_channels": [256, 384, 512, 768],
    "mid_channels": [768, 512],
    "down_sample": [True, True, True],
    "attn_down": [True, True, True],  # Attention in the DownBlock and UpBlock of VQ-VAE
    "time_emb_dim": 512,
    "norm_channels": 32,
    "num_heads": 16,
    "conv_out_channels": 128,
    "num_down_layers": 2,
    "num_mid_layers": 2,
    "num_up_layers": 2,
    "condition_config": {
        "condition_types": ["text", "image"],
        "text_condition_config": {
            "text_embed_model": "clip",
            "train_text_embed_model": False,
            "text_embed_dim": 512,  # Each token should map to text_embed_dim sized vector
            "cond_drop_prob": 0.1,  # Probability of dropping conditioning during training to allow the model to generate images without conditioning as well
        },
        "image_condition_config": {
            "image_condition_input_channels": 18,  # CelebA has 18 classes excluding background
            "image_condition_output_channels": 3,
            "image_condition_h": 512,  # Mask height
            "image_condition_w": 512,  # Mask width
            "cond_drop_prob": 0.1,  # Probability of dropping conditioning during training to allow the model to generate images without conditioning as well
        },
    },
}

autoencoder_params = {
    "z_channels": 4,
    "codebook_size": 8192,
    "down_channels": [64, 128, 256, 256],
    "mid_channels": [256, 256],
    "down_sample": [True, True, True],
    "attn_down": [
        False,
        False,
        False,
    ],  # No attention in the DownBlock and UpBlock of VQ-VAE
    "norm_channels": 32,
    "num_heads": 4,
    "num_down_layers": 2,
    "num_mid_layers": 2,
    "num_up_layers": 2,
}

train_params = {
    "seed": 1111,
    "task_name": "celebhq",  # Folder to save models and images to
    "ldm_batch_size": 16,
    "autoencoder_batch_size": 4,
    "disc_start": 15000,
    "disc_weight": 0.5,
    "codebook_weight": 1,
    "commitment_beta": 0.2,
    "perceptual_weight": 1,
    "kl_weight": 0.000005,
    "ldm_epochs": 100,
    "autoencoder_epochs": 20,
    "num_samples": 1,
    "num_grid_rows": 1,
    "ldm_lr": 0.000005,
    "autoencoder_lr": 0.00001,
    "autoencoder_acc_steps": 4,
    "autoencoder_img_save_steps": 64,
    "save_latents": True,
    "cf_guidance_scale": 1.0,
    "vqvae_latent_dir_name": "vqvae_latents",
    "ldm_ckpt_name": "ddpm_ckpt_class_cond.pth",
    "vqvae_autoencoder_ckpt_name": "vqvae_autoencoder_ckpt.pth",
}
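

# The cond_drop_prob entries above drop the conditioning for a fraction of
# training steps, so the model also learns an unconditional prediction. At
# sampling time, "cf_guidance_scale" can then blend the two outputs
# (classifier-free guidance). A minimal sketch; the function and argument
# names below are illustrative and not part of the original pipeline:
def classifier_free_guidance(cond_pred, uncond_pred, guide_scale):
    # guide_scale == 1.0 recovers the purely conditional prediction;
    # larger values push samples towards the conditioning signal
    return uncond_pred + guide_scale * (cond_pred - uncond_pred)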


def get_config_value(config, key, default_value):
    return config.get(key, default_value)


def spatial_average(in_tens, keepdim=True):
    return in_tens.mean([2, 3], keepdim=keepdim)


class LinearNoiseScheduler:
    def __init__(self, num_timesteps, beta_start, beta_end):
        self.num_timesteps = num_timesteps
        self.beta_start = beta_start
        self.beta_end = beta_end
        self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_timesteps) ** 2  # linear in sqrt-space ("scaled linear" schedule)
        self.alphas = 1.0 - self.betas
        self.alpha_cum_prod = torch.cumprod(self.alphas, dim=0)
        self.sqrt_alpha_cum_prod = torch.sqrt(self.alpha_cum_prod)
        self.sqrt_one_minus_alpha_cum_prod = torch.sqrt(1 - self.alpha_cum_prod)

    def add_noise(self, original, noise, t):
        # original: (batch_size, c, h, w), t: tensor of timesteps (batch_size,)
        batch_size = original.shape[0]
        sqrt_alpha_cum_prod = self.sqrt_alpha_cum_prod.to(original.device)[t].view(
            batch_size, 1, 1, 1
        )
        sqrt_one_minus_alpha_cum_prod = self.sqrt_one_minus_alpha_cum_prod.to(
            original.device
        )[t].view(batch_size, 1, 1, 1)
        return sqrt_alpha_cum_prod * original + sqrt_one_minus_alpha_cum_prod * noise

    def sample_prev_timestep(self, xt, noise_pred, t):
        batch_size = xt.shape[0]
        alpha_cum_prod_t = self.alpha_cum_prod.to(xt.device)[t].view(
            batch_size, 1, 1, 1
        )
        sqrt_one_minus_alpha_cum_prod_t = self.sqrt_one_minus_alpha_cum_prod.to(
            xt.device
        )[t].view(batch_size, 1, 1, 1)
        x0 = (xt - sqrt_one_minus_alpha_cum_prod_t * noise_pred) / torch.sqrt(
            alpha_cum_prod_t
        )
        x0 = torch.clamp(x0, -1.0, 1.0)
        betas_t = self.betas.to(xt.device)[t].view(batch_size, 1, 1, 1)
        mean = (
            xt - betas_t / sqrt_one_minus_alpha_cum_prod_t * noise_pred
        ) / torch.sqrt(self.alphas.to(xt.device)[t].view(batch_size, 1, 1, 1))
        if t[0] == 0:  # assumes every sample in the batch shares the same timestep
            return mean, x0
        else:
            prev_alpha_cum_prod = self.alpha_cum_prod.to(xt.device)[
                (t - 1).clamp(min=0)
            ].view(batch_size, 1, 1, 1)
            variance = (1 - prev_alpha_cum_prod) / (1 - alpha_cum_prod_t) * betas_t
            sigma = variance.sqrt()
            z = torch.randn_like(xt)
            return mean + sigma * z, x0
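

# A minimal usage sketch for the scheduler above (illustrative only; uses the
# diffusion_params config defined earlier):
def _scheduler_demo():
    scheduler = LinearNoiseScheduler(**diffusion_params)
    x0 = torch.randn(2, 4, 32, 32)  # clean latents
    noise = torch.randn_like(x0)
    t = torch.randint(0, diffusion_params["num_timesteps"], (2,))
    xt = scheduler.add_noise(x0, noise, t)  # forward process q(x_t | x_0)
    # One reverse step, with the true noise standing in for a model prediction
    prev_sample, x0_pred = scheduler.sample_prev_timestep(xt, noise, t)
    assert prev_sample.shape == xt.shape and x0_pred.shape == xt.shape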


def get_tokenizer_and_model(model_type, device, eval_mode=True):
    assert model_type in (
        "bert",
        "clip",
    ), "Text model can only be one of 'clip' or 'bert'"
    if model_type == "bert":
        text_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
        text_model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(
            device
        )
    else:
        text_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16")
        text_model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch16").to(
            device
        )
    if eval_mode:
        text_model.eval()
    return text_tokenizer, text_model


def get_text_representation(text, text_tokenizer, text_model, device, max_length=77):
    token_output = text_tokenizer(
        text,
        truncation=True,
        padding="max_length",
        return_attention_mask=True,
        max_length=max_length,
    )
    tokens_tensor = torch.tensor(token_output["input_ids"]).to(device)
    mask_tensor = torch.tensor(token_output["attention_mask"]).to(device)
    text_embed = text_model(tokens_tensor, attention_mask=mask_tensor).last_hidden_state
    return text_embed
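

# Usage sketch for the two text helpers above (illustrative only; downloads
# the CLIP weights on first run):
def _text_embed_demo(device="cpu"):
    tokenizer, model = get_tokenizer_and_model("clip", device)
    embed = get_text_representation(
        ["a photo of a face"], tokenizer, model, device
    )
    # One 512-dim vector per token position, padded/truncated to 77 tokens
    assert embed.shape == (1, 77, 512)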


def get_time_embedding(time_steps, temb_dim):
    """
    Convert time steps tensor into an embedding using the sinusoidal time embedding formula
    time_steps: 1D tensor of length batch size
    temb_dim: Dimension of the embedding
    """
    assert temb_dim % 2 == 0, "time embedding dimension must be divisible by 2"

    # factor = 10000^(2i/d_model)
    factor = 10000 ** (
        (
            torch.arange(
                start=0,
                end=temb_dim // 2,
                dtype=torch.float32,
                device=time_steps.device,
            )
            / (temb_dim // 2)
        )
    )

    t_emb = time_steps.unsqueeze(dim=-1).repeat(1, temb_dim // 2) / factor
    t_emb = torch.cat([torch.sin(t_emb), torch.cos(t_emb)], dim=-1)

    return t_emb  # (batch_size, temb_dim)
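

# Sanity-check sketch for the sinusoidal embedding above (illustrative only):
def _time_embedding_demo():
    t = torch.tensor([0, 10, 999])
    t_emb = get_time_embedding(t, temb_dim=512)
    assert t_emb.shape == (3, 512)
    # t = 0 embeds to sin(0) = 0 in the first half and cos(0) = 1 in the second
    assert torch.allclose(t_emb[0, :256], torch.zeros(256))
    assert torch.allclose(t_emb[0, 256:], torch.ones(256))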


class DownBlock(nn.Module):
    """
    Down conv block with attention.
    1. Resnet block with time embedding
    2. Attention block
    3. Downsample

    in_channels: Number of channels in the input feature map.
    out_channels: Number of channels produced by this block.
    t_emb_dim: Dimension of the time embedding. Used only in the diffusion UNet; set to None in the autoencoder.
    down_sample: Whether to apply downsampling at the end.
    num_heads: Number of attention heads (used if attention is enabled).
    num_layers: How many sub-blocks to apply in sequence.
    attn: Whether to apply self-attention
    norm_channels: Number of groups for GroupNorm.
    cross_attn: Whether to apply cross-attention.
    context_dim: If performing cross-attention, provide a context_dim for extra conditioning context.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        t_emb_dim,
        down_sample,
        num_heads,
        num_layers,
        attn,
        norm_channels,
        cross_attn=False,
        context_dim=None,
    ):
        super().__init__()

        self.num_layers = num_layers
        self.down_sample = down_sample
        self.attn = attn
        self.context_dim = context_dim
        self.cross_attn = cross_attn
        self.t_emb_dim = t_emb_dim

        self.resnet_conv_first = nn.ModuleList(
            [
                nn.Sequential(
                    nn.GroupNorm(
                        norm_channels, in_channels if i == 0 else out_channels
                    ),  # Normalizes over channels. For the first sub-block, the in_channels=in_channels, else out_channels
                    nn.SiLU(),
                    nn.Conv2d(
                        in_channels=(in_channels if i == 0 else out_channels),
                        out_channels=out_channels,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                    ),  # (batch_size, c, h, w) -> (batch_size, out_channels, h, w)
                )
                for i in range(num_layers)
            ]
        )

        # Only add the time embedding for diffusion and not AutoEncoder
        if self.t_emb_dim is not None:
            self.t_emb_layers = nn.ModuleList(
                [
                    nn.Sequential(
                        nn.SiLU(),
                        nn.Linear(
                            in_features=self.t_emb_dim, out_features=out_channels
                        ),  # (batch_size, t_emb_dim) -> (batch_size, out_channels)
                    )
                    for i in range(num_layers)
                ]
            )

        self.resnet_conv_second = nn.ModuleList(
            [
                nn.Sequential(
                    nn.GroupNorm(norm_channels, out_channels),
                    nn.SiLU(),
                    nn.Conv2d(
                        in_channels=out_channels,
                        out_channels=out_channels,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                    ),  # (batch_size, out_channels, h, w) -> (batch_size, out_channels, h, w)
                )
                for i in range(num_layers)
            ]
        )

        self.residual_input_conv = nn.ModuleList(
            [
                nn.Conv2d(
                    in_channels=(in_channels if i == 0 else out_channels),
                    out_channels=out_channels,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                )  # (batch_size, in_channels, h, w) -> (batch_size, out_channels, h, w)
                for i in range(num_layers)
            ]
        )

        if self.attn:
            self.attention_norms = nn.ModuleList(
                [nn.GroupNorm(norm_channels, out_channels) for i in range(num_layers)]
            )

            self.attentions = nn.ModuleList(
                [
                    nn.MultiheadAttention(
                        embed_dim=out_channels, num_heads=num_heads, batch_first=True
                    )
                    for i in range(num_layers)
                ]
            )

        # Cross attention for text conditioning
        if self.cross_attn:
            assert (
                context_dim is not None
            ), "Context Dimension must be passed for cross attention"

            self.cross_attention_norms = nn.ModuleList(
                [nn.GroupNorm(norm_channels, out_channels) for i in range(num_layers)]
            )

            self.cross_attentions = nn.ModuleList(
                [
                    nn.MultiheadAttention(
                        embed_dim=out_channels, num_heads=num_heads, batch_first=True
                    )
                    for i in range(num_layers)
                ]
            )

            self.context_proj = nn.ModuleList(
                [
                    nn.Linear(in_features=context_dim, out_features=out_channels)
                    for i in range(num_layers)
                ]
            )

        # Down sample by a factor of 2
        self.down_sample_conv = (
            nn.Conv2d(
                in_channels=out_channels,
                out_channels=out_channels,
                kernel_size=4,
                stride=2,
                padding=1,
            )
            if self.down_sample
            else nn.Identity()
        )  # (batch_size, out_channels, h / 2, w / 2)

    def forward(self, x, t_emb=None, context=None):
        out = x
        for i in range(self.num_layers):
            # Resnet block of UNET
            resnet_input = out  # (batch_size, c, h, w)

            out = self.resnet_conv_first[i](out)  # (batch_size, out_channels, h, w)

            # Only add the time embedding for diffusion and not AutoEncoder
            if self.t_emb_dim is not None:
                # Add the embeddings for timesteps - (batch_size, t_emb_dim) -> (batch_size, out_channels, 1, 1)
                out = out + self.t_emb_layers[i](t_emb).unsqueeze(dim=-1).unsqueeze(
                    dim=-1
                )  # (batch_size, out_channels, h, w)

            out = self.resnet_conv_second[i](
                out
            )  # (batch_size, out_channels, h, w) -> (batch_size, out_channels, h, w)

            # Residual Connection
            out = out + self.residual_input_conv[i](
                resnet_input
            )  # (batch_size, out_channels, h, w)

            # Only do for Diffusion and not for AutoEncoder
            if self.attn:
                # Attention block of UNET
                batch_size, channels, h, w = (
                    out.shape
                )  # (batch_size, out_channels, h, w)

                in_attn = out.reshape(
                    batch_size, channels, h * w
                )  # (batch_size, out_channels, h * w)
                in_attn = self.attention_norms[i](in_attn)
                in_attn = in_attn.transpose(1, 2)  # (batch_size, h * w, out_channels)

                # Self-Attention
                out_attn, attn_weights = self.attentions[i](in_attn, in_attn, in_attn)
                out_attn = out_attn.transpose(1, 2).reshape(
                    batch_size, channels, h, w
                )  # (batch_size, out_channels, h, w)

                # Skip connection
                out = out + out_attn  # (batch_size, out_channels, h, w)

            if self.cross_attn:
                assert (
                    context is not None
                ), "context cannot be None if cross attention layers are used"

                batch_size, channels, h, w = (
                    out.shape
                )  # (batch_size, out_channels, h, w)

                in_attn = out.reshape(
                    batch_size, channels, h * w
                )  # (batch_size, out_channels, h * w)
                in_attn = self.cross_attention_norms[i](in_attn)
                in_attn = in_attn.transpose(1, 2)  # (batch_size, h * w, out_channels)

                assert (
                    context.shape[0] == x.shape[0]
                    and context.shape[-1] == self.context_dim
                )  # Make sure the batch_size and context_dim match with the model's parameters
                context_proj = self.context_proj[i](
                    context
                )  # (batch_size, seq_len, context_dim) -> (batch_size, seq_len, out_channels)

                # Cross-Attention
                out_attn, attn_weights = self.cross_attentions[i](
                    in_attn, context_proj, context_proj
                )  # (batch_size, h * w, out_channels)
                out_attn = out_attn.transpose(1, 2).reshape(
                    batch_size, channels, h, w
                )  # (batch_size, out_channels, h, w)

                # Skip Connection
                out = out + out_attn  # (batch_size, out_channels, h, w)

        # Downsampling
        out = self.down_sample_conv(out)  # (batch_size, out_channels, h / 2, w / 2)
        return out
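

# Shape sketch for a single DownBlock with self- and cross-attention
# (illustrative only; the sizes are chosen to be divisible by norm_channels):
def _down_block_demo():
    block = DownBlock(
        in_channels=64, out_channels=128, t_emb_dim=512, down_sample=True,
        num_heads=8, num_layers=2, attn=True, norm_channels=32,
        cross_attn=True, context_dim=512,
    )
    x = torch.randn(2, 64, 32, 32)
    t_emb = torch.randn(2, 512)
    context = torch.randn(2, 77, 512)  # e.g. CLIP token embeddings
    out = block(x, t_emb=t_emb, context=context)
    assert out.shape == (2, 128, 16, 16)  # halved by the 4x4 stride-2 conv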


class MidBlock(nn.Module):
    """
    Mid conv block with attention.
    1. Resnet block with time embedding
    2. Attention block
    3. Resnet block with time embedding

    in_channels: Number of channels in the input feature map.
    out_channels: Number of channels produced by this block.
    t_emb_dim: Dimension of the time embedding. Used only in the diffusion UNet; set to None in the autoencoder.
    num_heads: Number of attention heads (used if attention is enabled).
    num_layers: How many sub-blocks to apply in sequence.
    norm_channels: Number of groups for GroupNorm.
    cross_attn: Whether to apply cross-attention.
    context_dim: If performing cross-attention, provide a context_dim for extra conditioning context.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        t_emb_dim,
        num_heads,
        num_layers,
        norm_channels,
        cross_attn=False,
        context_dim=None,
    ):
        super().__init__()

        self.num_layers = num_layers
        self.t_emb_dim = t_emb_dim
        self.context_dim = context_dim
        self.cross_attn = cross_attn

        self.resnet_conv_first = nn.ModuleList(
            [
                nn.Sequential(
                    nn.GroupNorm(
                        norm_channels, in_channels if i == 0 else out_channels
                    ),  # Normalizes over channels. For the first sub-block, the in_channels=in_channels, else out_channels
                    nn.SiLU(),
                    nn.Conv2d(
                        in_channels=(in_channels if i == 0 else out_channels),
                        out_channels=out_channels,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                    ),  # (batch_size, c, h, w) -> (batch_size, out_channels, h, w)
                )
                for i in range(num_layers + 1)
            ]
        )

        # Only add the time embedding for diffusion and not AutoEncoder
        if self.t_emb_dim is not None:
            self.t_emb_layers = nn.ModuleList(
                [
                    nn.Sequential(
                        nn.SiLU(),
                        nn.Linear(
                            in_features=self.t_emb_dim, out_features=out_channels
                        ),  # (batch_size, t_emb_dim) -> (batch_size, out_channels)
                    )
                    for i in range(num_layers + 1)
                ]
            )

        self.resnet_conv_second = nn.ModuleList(
            [
                nn.Sequential(
                    nn.GroupNorm(norm_channels, out_channels),
                    nn.SiLU(),
                    nn.Conv2d(
                        in_channels=out_channels,
                        out_channels=out_channels,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                    ),  # (batch_size, out_channels, h, w) -> (batch_size, out_channels, h, w)
                )
                for i in range(num_layers + 1)
            ]
        )

        self.residual_input_conv = nn.ModuleList(
            [
                nn.Conv2d(
                    in_channels=(in_channels if i == 0 else out_channels),
                    out_channels=out_channels,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                )  # (batch_size, in_channels, h, w) -> (batch_size, out_channels, h, w)
                for i in range(num_layers + 1)
            ]
        )

        self.attention_norms = nn.ModuleList(
            [nn.GroupNorm(norm_channels, out_channels) for i in range(num_layers)]
        )

        self.attentions = nn.ModuleList(
            [
                nn.MultiheadAttention(
                    embed_dim=out_channels, num_heads=num_heads, batch_first=True
                )
                for i in range(num_layers)
            ]
        )

        # Cross attention for text conditioning
        if self.cross_attn:
            assert (
                context_dim is not None
            ), "Context Dimension must be passed for cross attention"

            self.cross_attention_norms = nn.ModuleList(
                [nn.GroupNorm(norm_channels, out_channels) for i in range(num_layers)]
            )

            self.cross_attentions = nn.ModuleList(
                [
                    nn.MultiheadAttention(
                        embed_dim=out_channels, num_heads=num_heads, batch_first=True
                    )
                    for i in range(num_layers)
                ]
            )

            self.context_proj = nn.ModuleList(
                [
                    nn.Linear(in_features=context_dim, out_features=out_channels)
                    for i in range(num_layers)
                ]
            )

    def forward(self, x, t_emb=None, context=None):
        out = x

        # First ResNet block
        resnet_input = out  # (batch_size, c, h, w)
        out = self.resnet_conv_first[0](out)  # (batch_size, out_channels, h, w)

        # Only add the time embedding for diffusion and not AutoEncoder
        if self.t_emb_dim is not None:
            # Add the embeddings for timesteps - (batch_size, t_emb_dim) -> (batch_size, out_channels, 1, 1)
            out = out + self.t_emb_layers[0](t_emb).unsqueeze(dim=-1).unsqueeze(
                dim=-1
            )  # (batch_size, out_channels, h, w)

        out = self.resnet_conv_second[0](
            out
        )  # (batch_size, out_channels, h, w) -> (batch_size, out_channels, h, w)

        # Residual Connection
        out = out + self.residual_input_conv[0](
            resnet_input
        )  # (batch_size, out_channels, h, w)

        for i in range(self.num_layers):
            # Attention Block
            batch_size, channels, h, w = out.shape  # (batch_size, out_channels, h, w)

            # Do for both Diffusion and AutoEncoder
            in_attn = out.reshape(
                batch_size, channels, h * w
            )  # (batch_size, out_channels, h * w)
            in_attn = self.attention_norms[i](in_attn)
            in_attn = in_attn.transpose(1, 2)  # (batch_size, h * w, out_channels)

            # Self-Attention
            out_attn, attn_weights = self.attentions[i](in_attn, in_attn, in_attn)
            out_attn = out_attn.transpose(1, 2).reshape(batch_size, channels, h, w)

            # Skip connection
            out = out + out_attn  # (batch_size, out_channels, h, w)

            if self.cross_attn:
                assert (
                    context is not None
                ), "context cannot be None if cross attention layers are used"
                batch_size, channels, h, w = out.shape

                in_attn = out.reshape(
                    batch_size, channels, h * w
                )  # (batch_size, out_channels, h * w)
                in_attn = self.cross_attention_norms[i](in_attn)
                in_attn = in_attn.transpose(1, 2)  # (batch_size, h * w, out_channels)

                assert (
                    context.shape[0] == x.shape[0]
                    and context.shape[-1] == self.context_dim
                )  # Make sure the batch_size and context_dim match with the model's parameters
                context_proj = self.context_proj[i](
                    context
                )  # (batch_size, seq_len, context_dim) -> (batch_size, seq_len, out_channels)

                # Cross-Attention
                out_attn, attn_weights = self.cross_attentions[i](
                    in_attn, context_proj, context_proj
                )
                out_attn = out_attn.transpose(1, 2).reshape(
                    batch_size, channels, h, w
                )  # (batch_size, out_channels, h, w)

                # Skip Connection
                out = out + out_attn  # (batch_size, out_channels, h, w)

            # Resnet Block
            resnet_input = out
            out = self.resnet_conv_first[i + 1](
                out
            )  # (batch_size, out_channels, h, w) -> (batch_size, out_channels, h, w)

            # Only add the time embedding for diffusion and not AutoEncoder
            if self.t_emb_dim is not None:
                # Add the embeddings for timesteps - (batch_size, t_emb_dim) -> (batch_size, out_channels, 1, 1)
                out = out + self.t_emb_layers[i + 1](t_emb).unsqueeze(dim=-1).unsqueeze(
                    dim=-1
                )  # (batch_size, out_channels, h, w)

            out = self.resnet_conv_second[i + 1](
                out
            )  # (batch_size, out_channels, h, w) -> (batch_size, out_channels, h, w)

            # Residual Connection
            out = out + self.residual_input_conv[i + 1](
                resnet_input
            )  # (batch_size, out_channels, h, w)

        return out
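

# MidBlock also runs without a time embedding (t_emb_dim=None), which is how
# the VQVAE encoder and decoder below use it. A minimal sketch (illustrative):
def _mid_block_demo():
    block = MidBlock(
        in_channels=256, out_channels=256, t_emb_dim=None,
        num_heads=4, num_layers=2, norm_channels=32,
    )
    x = torch.randn(2, 256, 16, 16)
    out = block(x)  # no t_emb, no context
    assert out.shape == (2, 256, 16, 16)  # spatial size is unchanged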


class UpBlock(nn.Module):
    """
    Up conv block with attention.
    1. Upsample
    2. Concatenate the corresponding DownBlock output
    3. Resnet block with time embedding
    4. Attention block

    in_channels: Number of channels in the input feature map.
    out_channels: Number of channels produced by this block.
    t_emb_dim: Dimension of the time embedding. Used only in the diffusion UNet; set to None in the autoencoder.
    up_sample: Whether to apply upsampling at the end.
    num_heads: Number of attention heads (used if attention is enabled).
    num_layers: How many sub-blocks to apply in sequence.
    attn: Whether to apply self-attention
    norm_channels: Number of groups for GroupNorm.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        t_emb_dim,
        up_sample,
        num_heads,
        num_layers,
        attn,
        norm_channels,
    ):
        super().__init__()

        self.num_layers = num_layers
        self.up_sample = up_sample
        self.t_emb_dim = t_emb_dim
        self.attn = attn

        # Upsample by a factor of 2
        self.up_sample_conv = (
            nn.ConvTranspose2d(
                in_channels=in_channels,
                out_channels=in_channels,
                kernel_size=4,
                stride=2,
                padding=1,
            )
            if self.up_sample
            else nn.Identity()
        )  # (batch_size, c, h * 2, w * 2)

        self.resnet_conv_first = nn.ModuleList(
            [
                nn.Sequential(
                    nn.GroupNorm(
                        norm_channels, in_channels if i == 0 else out_channels
                    ),  # Normalizes over channels. For the first sub-block, the in_channels=in_channels, else out_channels
                    nn.SiLU(),
                    nn.Conv2d(
                        in_channels=(in_channels if i == 0 else out_channels),
                        out_channels=out_channels,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                    ),  # (batch_size, c, h, w) -> (batch_size, out_channels, h, w)
                )
                for i in range(num_layers)
            ]
        )

        # Only add the time embedding for diffusion and not AutoEncoder
        if self.t_emb_dim is not None:
            self.t_emb_layers = nn.ModuleList(
                [
                    nn.Sequential(
                        nn.SiLU(),
                        nn.Linear(
                            in_features=self.t_emb_dim, out_features=out_channels
                        ),  # (batch_size, t_emb_dim) -> (batch_size, out_channels)
                    )
                    for i in range(num_layers)
                ]
            )

        self.resnet_conv_second = nn.ModuleList(
            [
                nn.Sequential(
                    nn.GroupNorm(norm_channels, out_channels),
                    nn.SiLU(),
                    nn.Conv2d(
                        in_channels=out_channels,
                        out_channels=out_channels,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                    ),  # (batch_size, out_channels, h, w) -> (batch_size, out_channels, h, w)
                )
                for i in range(num_layers)
            ]
        )

        self.residual_input_conv = nn.ModuleList(
            [
                nn.Conv2d(
                    in_channels=(in_channels if i == 0 else out_channels),
                    out_channels=out_channels,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                )  # (batch_size, in_channels, h, w) -> (batch_size, out_channels, h, w)
                for i in range(num_layers)
            ]
        )

        if self.attn:
            self.attention_norms = nn.ModuleList(
                [nn.GroupNorm(norm_channels, out_channels) for i in range(num_layers)]
            )

            self.attentions = nn.ModuleList(
                [
                    nn.MultiheadAttention(
                        embed_dim=out_channels, num_heads=num_heads, batch_first=True
                    )
                    for i in range(num_layers)
                ]
            )

    def forward(self, x, out_down=None, t_emb=None):
        # x shape: (batch_size, c, h, w)

        # Upsample
        x = self.up_sample_conv(
            x
        )  # (batch_size, c, h, w) -> (batch_size, c, h * 2, w * 2)

        # Only done for the diffusion UNet:
        # concatenate with the output of the corresponding DownBlock
        if out_down is not None:
            x = torch.cat(
                [x, out_down], dim=1
            )  # (batch_size, c, h * 2, w * 2) -> (batch_size, c * 2, h * 2, w * 2)

        out = x  # (batch_size, c, h * 2, w * 2)

        for i in range(self.num_layers):
            # Resnet block
            resnet_input = out
            out = self.resnet_conv_first[i](
                out
            )  # (batch_size, in_channels, h * 2, w * 2) -> (batch_size, out_channels, h * 2, w * 2)

            # Only add the time embedding for diffusion and not AutoEncoder
            if self.t_emb_dim is not None:
                # Add the embeddings for timesteps - (batch_size, t_emb_dim) -> (batch_size, out_channels, 1, 1)
                out = out + self.t_emb_layers[i](t_emb).unsqueeze(dim=-1).unsqueeze(
                    dim=-1
                )  # (batch_size, out_channels, h * 2, w * 2)

            out = self.resnet_conv_second[i](
                out
            )  # (batch_size, out_channels, h * 2, w * 2) -> (batch_size, out_channels, h * 2, w * 2)

            # Residual Connection
            out = out + self.residual_input_conv[i](
                resnet_input
            )  # (batch_size, out_channels, h * 2, w * 2)

            # Only do for Diffusion and not for AutoEncoder
            if self.attn:
                # Attention block of UNET
                batch_size, channels, h, w = out.shape

                in_attn = out.reshape(
                    batch_size, channels, h * w
                )  # (batch_size, out_channels, h * w * 4)
                in_attn = self.attention_norms[i](in_attn)
                in_attn = in_attn.transpose(
                    1, 2
                )  # (batch_size, h * w * 4, out_channels)

                # Self-Attention
                out_attn, attn_weights = self.attentions[i](in_attn, in_attn, in_attn)
                out_attn = out_attn.transpose(1, 2).reshape(
                    batch_size, channels, h, w
                )  # (batch_size, out_channels, h * 2, w * 2)

                # Skip connection
                out = out + out_attn  # (batch_size, out_channels, h * 2, w * 2)

        return out  # (batch_size, out_channels, h * 2, w * 2)


class UpBlockUNet(nn.Module):
    """
    Up conv block with attention.
    1. Upsample
    2. Concatenate the corresponding DownBlock output
    3. Resnet block with time embedding
    4. Attention block

    in_channels: Number of channels in the input feature map (already doubled to account for concatenation with the DownBlock output).
    out_channels: Number of channels produced by this block.
    t_emb_dim: Dimension of the time embedding. Used only in the diffusion UNet; set to None in the autoencoder.
    up_sample: Whether to apply upsampling at the end.
    num_heads: Number of attention heads (used if attention is enabled).
    num_layers: How many sub-blocks to apply in sequence.
    norm_channels: Number of groups for GroupNorm.
    cross_attn: Whether to apply cross-attention.
    context_dim: If performing cross-attention, provide a context_dim for extra conditioning context.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        t_emb_dim,
        up_sample,
        num_heads,
        num_layers,
        norm_channels,
        cross_attn=False,
        context_dim=None,
    ):
        super().__init__()

        self.num_layers = num_layers
        self.up_sample = up_sample
        self.t_emb_dim = t_emb_dim
        self.cross_attn = cross_attn
        self.context_dim = context_dim

        self.up_sample_conv = (
            nn.ConvTranspose2d(
                in_channels=(in_channels // 2),
                out_channels=(in_channels // 2),
                kernel_size=4,
                stride=2,
                padding=1,
            )
            if self.up_sample
            else nn.Identity()
        )  # (batch_size, in_channels // 2, h * 2, w * 2)

        self.resnet_conv_first = nn.ModuleList(
            [
                nn.Sequential(
                    nn.GroupNorm(
                        norm_channels, in_channels if i == 0 else out_channels
                    ),  # Normalizes over channels. For the first sub-block, the in_channels=in_channels, else out_channels
                    nn.SiLU(),
                    nn.Conv2d(
                        in_channels=(in_channels if i == 0 else out_channels),
                        out_channels=out_channels,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                    ),  # (batch_size, in_channels, h * 2, w * 2) -> (batch_size, out_channels, h * 2, w * 2) - starts at in_channels rather than in_channels // 2 because of the skip concatenation
                )
                for i in range(num_layers)
            ]
        )

        # Only add the time embedding if needed for UNET in diffusion
        # Do not add the time embedding in the AutoEncoder
        if self.t_emb_dim is not None:
            self.t_emb_layers = nn.ModuleList(
                [
                    nn.Sequential(
                        nn.SiLU(),
                        nn.Linear(
                            in_features=self.t_emb_dim, out_features=out_channels
                        ),  # (batch_size, t_emb_dim) -> (batch_size, out_channels)
                    )
                    for i in range(num_layers)
                ]
            )

        self.resnet_conv_second = nn.ModuleList(
            [
                nn.Sequential(
                    nn.GroupNorm(norm_channels, out_channels),
                    nn.SiLU(),
                    nn.Conv2d(
                        in_channels=out_channels,
                        out_channels=out_channels,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                    ),  # (batch_size, out_channels, h * 2, w * 2) -> (batch_size, out_channels, h * 2, w * 2)
                )
                for i in range(num_layers)
            ]
        )

        self.residual_input_conv = nn.ModuleList(
            [
                nn.Conv2d(
                    in_channels=(in_channels if i == 0 else out_channels),
                    out_channels=out_channels,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                )
                for i in range(
                    num_layers
                )  # (batch_size, in_channels, h * 2, w * 2) -> (batch_size, out_channels, h * 2, w * 2)
            ]
        )

        self.attention_norms = nn.ModuleList(
            [nn.GroupNorm(norm_channels, out_channels) for i in range(num_layers)]
        )

        self.attentions = nn.ModuleList(
            [
                nn.MultiheadAttention(
                    embed_dim=out_channels, num_heads=num_heads, batch_first=True
                )
                for i in range(num_layers)
            ]
        )

        # Cross attention for text conditioning
        if self.cross_attn:
            assert (
                context_dim is not None
            ), "Context Dimension must be passed for cross attention"

            self.cross_attention_norms = nn.ModuleList(
                [nn.GroupNorm(norm_channels, out_channels) for i in range(num_layers)]
            )

            self.cross_attentions = nn.ModuleList(
                [
                    nn.MultiheadAttention(
                        embed_dim=out_channels, num_heads=num_heads, batch_first=True
                    )
                    for i in range(num_layers)
                ]
            )

            self.context_proj = nn.ModuleList(
                [
                    nn.Linear(in_features=context_dim, out_features=out_channels)
                    for i in range(num_layers)
                ]
            )

    def forward(self, x, out_down=None, t_emb=None, context=None):
        # x shape: (batch_size, in_channels // 2, h, w)

        # Upsample
        x = self.up_sample_conv(
            x
        )  # (batch_size, in_channels // 2, h, w) -> (batch_size, in_channels // 2, h * 2, w * 2)

        # Concatenate with the output of respective DownBlock
        if out_down is not None:
            x = torch.cat(
                [x, out_down], dim=1
            )  # (batch_size, in_channels // 2, h * 2, w * 2) -> (batch_size, in_channels, h * 2, w * 2)

        out = x  # (batch_size, in_channels, h * 2, w * 2)
        for i in range(self.num_layers):
            # Resnet block
            resnet_input = out

            out = self.resnet_conv_first[i](
                out
            )  # (batch_size, in_channels, h * 2, w * 2) -> (batch_size, out_channels, h * 2, w * 2)

            if self.t_emb_dim is not None:
                # Add the embeddings for timesteps - (batch_size, t_emb_dim) -> (batch_size, out_channels, 1, 1)
                out = out + self.t_emb_layers[i](t_emb).unsqueeze(dim=-1).unsqueeze(
                    dim=-1
                )  # (batch_size, out_channels, h * 2, w * 2)

            out = self.resnet_conv_second[i](
                out
            )  # (batch_size, out_channels, h * 2, w * 2) -> (batch_size, out_channels, h * 2, w * 2)

            # Residual Connection
            out = out + self.residual_input_conv[i](
                resnet_input
            )  # (batch_size, out_channels, h * 2, w * 2)

            # Attention block of UNET
            batch_size, channels, h, w = (
                out.shape
            )  # (batch_size, out_channels, h * 2, w * 2)

            in_attn = out.reshape(
                batch_size, channels, h * w
            )  # (batch_size, out_channels, h * w * 4)
            in_attn = self.attention_norms[i](in_attn)
            in_attn = in_attn.transpose(1, 2)  # (batch_size, h * w * 4, out_channels)

            # Self-Attention
            out_attn, attn_weights = self.attentions[i](in_attn, in_attn, in_attn)
            out_attn = out_attn.transpose(1, 2).reshape(
                batch_size, channels, h, w
            )  # (batch_size, out_channels, h * 2, w * 2)

            # Skip connection
            out = out + out_attn  # (batch_size, out_channels, h * 2, w * 2)

            if self.cross_attn:
                assert (
                    context is not None
                ), "context cannot be None if cross attention layers are used"
                batch_size, channels, h, w = out.shape

                in_attn = out.reshape(
                    batch_size, channels, h * w
                )  # (batch_size, out_channels, h * w * 4)
                in_attn = self.cross_attention_norms[i](in_attn)
                in_attn = in_attn.transpose(
                    1, 2
                )  # (batch_size, h * w * 4, out_channels)

                assert (
                    len(context.shape) == 3
                ), "Context shape does not match batch_size, _, context_dim"

                assert (
                    context.shape[0] == x.shape[0]
                    and context.shape[-1] == self.context_dim
                ), "Context shape does not match batch_size, _, context_dim"  # Make sure the batch_size and context_dim match with the model's parameters
                context_proj = self.context_proj[i](
                    context
                )  # (batch_size, seq_len, context_dim) -> (batch_size, seq_len, out_channels)

                # Cross-Attention
                out_attn, attn_weights = self.cross_attentions[i](
                    in_attn, context_proj, context_proj
                )
                out_attn = out_attn.transpose(1, 2).reshape(
                    batch_size, channels, h, w
                )  # (batch_size, out_channels, h * 2, w * 2)

                # Skip Connection
                out = out + out_attn  # (batch_size, out_channels, h * 2, w * 2)

        return out  # (batch_size, out_channels, h * 2, w * 2)
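

# Shape sketch for UpBlockUNet with an encoder skip connection (illustrative
# only; in_channels counts the channels after concatenation):
def _up_block_unet_demo():
    block = UpBlockUNet(
        in_channels=256, out_channels=64, t_emb_dim=512, up_sample=True,
        num_heads=8, num_layers=2, norm_channels=32,
        cross_attn=True, context_dim=512,
    )
    x = torch.randn(2, 128, 8, 8)  # in_channels // 2 before upsampling
    out_down = torch.randn(2, 128, 16, 16)  # matching DownBlock output
    t_emb = torch.randn(2, 512)
    context = torch.randn(2, 77, 512)
    out = block(x, out_down=out_down, t_emb=t_emb, context=context)
    assert out.shape == (2, 64, 16, 16)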


class VQVAE(nn.Module):
    def __init__(self, image_channels, model_config):
        super().__init__()

        self.down_channels = model_config["down_channels"]
        self.mid_channels = model_config["mid_channels"]
        self.down_sample = model_config["down_sample"]
        self.num_down_layers = model_config["num_down_layers"]
        self.num_mid_layers = model_config["num_mid_layers"]
        self.num_up_layers = model_config["num_up_layers"]

        # To disable attention in Downblock of Encoder and Upblock of Decoder
        self.attns = model_config["attn_down"]

        # Latent Dimension
        self.z_channels = model_config[
            "z_channels"
        ]  # number of channels in the latent representation
        self.codebook_size = model_config[
            "codebook_size"
        ]  # number of discrete code vectors available
        self.norm_channels = model_config["norm_channels"]
        self.num_heads = model_config["num_heads"]

        assert self.mid_channels[0] == self.down_channels[-1]
        assert self.mid_channels[-1] == self.down_channels[-1]
        assert len(self.down_sample) == len(self.down_channels) - 1
        assert len(self.attns) == len(self.down_channels) - 1

        # Wherever we downsample in the encoder, use upsampling in the decoder at the corresponding location
        self.up_sample = list(reversed(self.down_sample))

        # Encoder
        self.encoder_conv_in = nn.Conv2d(
            in_channels=image_channels,
            out_channels=self.down_channels[0],
            kernel_size=3,
            stride=1,
            padding=1,
        )  # (batch_size, 3, h, w) -> (batch_size, c, h, w)

        # Downblock + Midblock
        self.encoder_layers = nn.ModuleList([])
        for i in range(len(self.down_channels) - 1):
            self.encoder_layers.append(
                DownBlock(
                    in_channels=self.down_channels[i],
                    out_channels=self.down_channels[i + 1],
                    t_emb_dim=None,
                    down_sample=self.down_sample[i],
                    num_heads=self.num_heads,
                    num_layers=self.num_down_layers,
                    attn=self.attns[i],
                    norm_channels=self.norm_channels,
                )
            )

        self.encoder_mids = nn.ModuleList([])
        for i in range(len(self.mid_channels) - 1):
            self.encoder_mids.append(
                MidBlock(
                    in_channels=self.mid_channels[i],
                    out_channels=self.mid_channels[i + 1],
                    t_emb_dim=None,
                    num_heads=self.num_heads,
                    num_layers=self.num_mid_layers,
                    norm_channels=self.norm_channels,
                )
            )

        self.encoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[-1])

        self.encoder_conv_out = nn.Conv2d(
            in_channels=self.down_channels[-1],
            out_channels=self.z_channels,
            kernel_size=3,
            stride=1,
            padding=1,
        )  # (batch_size, z_channels, h', w')

        # Pre Quantization Convolution
        self.pre_quant_conv = nn.Conv2d(
            in_channels=self.z_channels,
            out_channels=self.z_channels,
            kernel_size=1,
            stride=1,
            padding=0,
        )  # (batch_size, z_channels, h', w')

        # Codebook Vectors
        self.embedding = nn.Embedding(
            self.codebook_size, self.z_channels
        )  # (codebook_size, z_channels)

        # Decoder

        # Post Quantization Convolution
        self.post_quant_conv = nn.Conv2d(
            in_channels=self.z_channels,
            out_channels=self.z_channels,
            kernel_size=1,
            stride=1,
            padding=0,
        )  # (batch_size, z_channels, h', w')

        self.decoder_conv_in = nn.Conv2d(
            in_channels=self.z_channels,
            out_channels=self.mid_channels[-1],
            kernel_size=3,
            stride=1,
            padding=1,
        )  # (batch_size, c, h', w')

        # Midblock + Upblock
        self.decoder_mids = nn.ModuleList([])
        for i in reversed(range(1, len(self.mid_channels))):
            self.decoder_mids.append(
                MidBlock(
                    in_channels=self.mid_channels[i],
                    out_channels=self.mid_channels[i - 1],
                    t_emb_dim=None,
                    num_heads=self.num_heads,
                    num_layers=self.num_mid_layers,
                    norm_channels=self.norm_channels,
                )
            )

        self.decoder_layers = nn.ModuleList([])
        for i in reversed(range(1, len(self.down_channels))):
            self.decoder_layers.append(
                UpBlock(
                    in_channels=self.down_channels[i],
                    out_channels=self.down_channels[i - 1],
                    t_emb_dim=None,
                    up_sample=self.down_sample[i - 1],
                    num_heads=self.num_heads,
                    num_layers=self.num_up_layers,
                    attn=self.attns[i - 1],
                    norm_channels=self.norm_channels,
                )
            )

        self.decoder_norm_out = nn.GroupNorm(self.norm_channels, self.down_channels[0])

        self.decoder_conv_out = nn.Conv2d(
            in_channels=self.down_channels[0],
            out_channels=image_channels,
            kernel_size=3,
            stride=1,
            padding=1,
        )  # (batch_size, c, h, w)

    def quantize(self, x):
        batch_size, c, h, w = x.shape  # (batch_size, z_channels, h, w)

        x = x.permute(
            0, 2, 3, 1
        )  # (batch_size, z_channels, h, w) -> (batch_size, h, w, z_channels)
        x = x.reshape(
            batch_size, -1, c
        )  # (batch_size, h, w, z_channels) -> (batch_size, h * w, z_channels)

        # Find the nearest codebook vector: distances between x (batch_size, h * w, z_channels) and the codebook (batch_size, codebook_size, z_channels) -> (batch_size, h * w, codebook_size)
        dist = torch.cdist(
            x, self.embedding.weight.unsqueeze(dim=0).repeat((batch_size, 1, 1))
        )  # cdist computes the batched pairwise p-norm distance (Euclidean for the default p=2)

        # Get the index of the closest codebook vector -> (batch_size, h * w)
        min_encoding_indices = torch.argmin(dist, dim=-1)

        # Replace each encoder output vector with its nearest codebook vector
        quant_out = torch.index_select(
            self.embedding.weight, 0, min_encoding_indices.view(-1)
        )  # (batch_size * h * w, z_channels)

        x = x.reshape((-1, c))  # (batch_size * h * w, z_channels)

        # Commitment and Codebook Loss using MSE
        commitment_loss = torch.mean((quant_out.detach() - x) ** 2)
        codebook_loss = torch.mean((quant_out - x.detach()) ** 2)

        quantize_losses = {
            "codebook_loss": codebook_loss,
            "commitment_loss": commitment_loss,
        }

        # Straight-through estimator: the forward pass uses quant_out, gradients flow back to x
        quant_out = x + (quant_out - x).detach()

        quant_out = quant_out.reshape(batch_size, h, w, c).permute(
            0, 3, 1, 2
        )  # (batch_size, z_channels, h, w)
        min_encoding_indices = min_encoding_indices.reshape(
            (-1, h, w)
        )  # (batch_size, h, w)

        return quant_out, quantize_losses, min_encoding_indices
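
    # Worked example for quantize (illustrative, with codebook_size=4 and
    # z_channels=2): for a single latent vector x = [0.9, 0.1] and codebook
    # rows [[0, 0], [1, 0], [0, 1], [1, 1]], cdist gives distances of roughly
    # [0.906, 0.141, 1.273, 0.906], argmin picks index 1, and the vector is
    # replaced by [1, 0]. The straight-through line keeps that codebook vector
    # in the forward pass while letting gradients flow back into x unchanged.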

    def encode(self, x):
        out = self.encoder_conv_in(x)  # (batch_size, self.down_channels[0], h, w)

        # (batch_size, self.down_channels[0], h, w) -> (batch_size, self.down_channels[-1], h', w')
        for idx, down in enumerate(self.encoder_layers):
            out = down(out)

        # (batch_size, self.down_channels[-1], h', w') -> (batch_size, self.mid_channels[-1], h', w')
        for mid in self.encoder_mids:
            out = mid(out)

        out = self.encoder_norm_out(out)
        out = F.silu(out)

        out = self.encoder_conv_out(
            out
        )  # (batch_size, self.mid_channels[-1], h', w') -> (batch_size, self.z_channels, h', w')
        out = self.pre_quant_conv(
            out
        )  # (batch_size, self.z_channels, h', w') -> (batch_size, self.z_channels, h', w')

        out, quant_losses, min_encoding_indices = self.quantize(
            out
        )  # (batch_size, self.z_channels, h', w'), (codebook_loss, commitment_loss), (batch_size, h', w')
        return out, quant_losses

    def decode(self, z):
        out = z
        out = self.post_quant_conv(
            out
        )  # (batch_size, self.z_channels, h', w') -> (batch_size, self.z_channels, h', w')
        out = self.decoder_conv_in(
            out
        )  # (batch_size, self.z_channels, h', w') -> (batch_size, self.mid_channels[-1], h', w')

        # (batch_size, self.mid_channels[-1], h', w') -> (batch_size, self.down_channels[-1], h', w')
        for mid in self.decoder_mids:
            out = mid(out)

        # (batch_size, self.down_channels[-1], h', w') -> (batch_size, self.down_channels[0], h, w)
        for idx, up in enumerate(self.decoder_layers):
            out = up(out)

        out = self.decoder_norm_out(out)
        out = F.silu(out)

        out = self.decoder_conv_out(
            out
        )  # (batch_size, self.down_channels[0], h, w) -> (batch_size, c, h, w)
        return out

    def forward(self, x):
        # x shape: (batch_size, c, h, w)

        z, quant_losses = self.encode(
            x
        )  # (batch_size, self.z_channels, h', w'), (codebook_loss, commitment_loss)
        out = self.decode(z)  # (batch_size, c, h, w)

        return out, z, quant_losses
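

# A minimal smoke test (not part of the original module), assuming the
# DownBlock/MidBlock/UpBlock classes defined earlier in this file. The config
# values below are illustrative, chosen only so the asserts in __init__ pass.
def _vqvae_smoke_test():
    toy_config = {
        "down_channels": [32, 64, 128],
        "mid_channels": [128, 128],
        "down_sample": [True, True],
        "attn_down": [False, False],
        "num_down_layers": 1,
        "num_mid_layers": 1,
        "num_up_layers": 1,
        "z_channels": 4,
        "codebook_size": 512,
        "norm_channels": 8,
        "num_heads": 4,
    }
    model = VQVAE(image_channels=3, model_config=toy_config)
    x = torch.randn(2, 3, 32, 32)
    recon, z, losses = model(x)
    # Two downsampling stages shrink 32x32 images to an 8x8 latent grid
    assert z.shape == (2, 4, 8, 8)
    assert recon.shape == x.shape
    assert set(losses) == {"codebook_loss", "commitment_loss"}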


def validate_image_conditional_input(cond_input, x):
    assert (
        "image" in cond_input
    ), "Model initialized with image conditioning but cond_input has no image information"
    assert (
        cond_input["image"].shape[0] == x.shape[0]
    ), "Batch size mismatch of image condition and input"
    assert (
        cond_input["image"].shape[2] % x.shape[2] == 0
        and cond_input["image"].shape[3] % x.shape[3] == 0
    ), "Height/Width of image condition must be divisible by latent input"


def validate_class_conditional_input(cond_input, x, num_classes):
    assert (
        "class" in cond_input
    ), "Model initialized with class conditioning but cond_input has no class information"
    assert cond_input["class"].shape == (
        x.shape[0],
        num_classes,
    ), "Shape of class condition input must match (batch_size, num_classes)"


def get_config_value(config, key, default_value):
    return config.get(key, default_value)


class UNet(nn.Module):
    """
    Unet model comprising
    Down blocks, Midblocks and Uplocks
    """

    def __init__(self, image_channels, model_config):
        super().__init__()

        self.down_channels = model_config["down_channels"]
        self.mid_channels = model_config["mid_channels"]
        self.t_emb_dim = model_config["time_emb_dim"]
        self.down_sample = model_config["down_sample"]
        self.num_down_layers = model_config["num_down_layers"]
        self.num_mid_layers = model_config["num_mid_layers"]
        self.num_up_layers = model_config["num_up_layers"]
        self.attns = model_config["attn_down"]
        self.norm_channels = model_config["norm_channels"]
        self.num_heads = model_config["num_heads"]
        self.conv_out_channels = model_config["conv_out_channels"]

        assert self.mid_channels[0] == self.down_channels[-1]
        assert self.mid_channels[-1] == self.down_channels[-2]
        assert len(self.down_sample) == len(self.down_channels) - 1
        assert len(self.attns) == len(self.down_channels) - 1
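        # e.g. down_channels = [64, 128, 256, 256], mid_channels = [256, 256],
        # down_sample = [True, True, False], attn_down = [True, True, True]
        # satisfies all four asserts above (values illustrative only)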

        # Class, Mask, and Text Conditioning Config
        self.class_cond = False
        self.text_cond = False
        self.image_cond = False
        self.text_embed_dim = None
        self.condition_config = get_config_value(
            model_config, "condition_config", None
        )  # Get the dictionary containing conditional information

        if self.condition_config is not None:
            assert (
                "condition_types" in self.condition_config
            ), "Condition Type not provided in model config"
            condition_types = self.condition_config["condition_types"]

            # For class, text, and image, get necessary parameters
            if "class" in condition_types:
                self.class_cond = True
                self.num_classes = self.condition_config["class_condition_config"][
                    "num_classes"
                ]

            if "text" in condition_types:
                self.text_cond = True
                self.text_embed_dim = self.condition_config["text_condition_config"][
                    "text_embed_dim"
                ]

            if "image" in condition_types:
                self.image_cond = True
                self.image_cond_input_channels = self.condition_config[
                    "image_condition_config"
                ]["image_condition_input_channels"]
                self.image_cond_output_channels = self.condition_config[
                    "image_condition_config"
                ]["image_condition_output_channels"]

        if self.class_cond:
            # Embedding table for class conditioning; unconditional generation passes an all-zero class vector, so no class embedding gets added
            self.class_emb = nn.Embedding(
                self.num_classes, self.t_emb_dim
            )  # (num_classes, t_emb_dim)

        if self.image_cond:
            # Map the mask image to an image_cond_output_channels-channel image and concatenate it with the input along the channel dimension
            self.cond_conv_in = nn.Conv2d(
                in_channels=self.image_cond_input_channels,
                out_channels=self.image_cond_output_channels,
                kernel_size=1,
                stride=1,
                padding=0,
                bias=False,
            )

            self.conv_in_concat = nn.Conv2d(
                in_channels=(image_channels + self.image_cond_output_channels),
                out_channels=self.down_channels[0],
                kernel_size=3,
                stride=1,
                padding=1,
            )
        else:
            self.conv_in = nn.Conv2d(
                in_channels=image_channels,
                out_channels=self.down_channels[0],
                kernel_size=3,
                stride=1,
                padding=1,
            )  # (batch_size, image_channels, h, w) -> (batch_size, self.down_channels[0], h, w)

        self.cond = self.text_cond or self.image_cond or self.class_cond

        # Initial projection from sinusoidal time embedding
        self.t_proj = nn.Sequential(
            nn.Linear(in_features=self.t_emb_dim, out_features=self.t_emb_dim),
            nn.SiLU(),
            nn.Linear(in_features=self.t_emb_dim, out_features=self.t_emb_dim),
        )  # (batch_size, t_emb_dim)

        self.up_sample = list(reversed(self.down_sample))

        self.downs = nn.ModuleList([])
        for i in range(len(self.down_channels) - 1):
            # Cross attention and Context Dim are only used for text conditioning
            self.downs.append(
                DownBlock(
                    in_channels=self.down_channels[i],
                    out_channels=self.down_channels[i + 1],
                    t_emb_dim=self.t_emb_dim,
                    down_sample=self.down_sample[i],
                    num_heads=self.num_heads,
                    num_layers=self.num_down_layers,
                    attn=self.attns[i],
                    norm_channels=self.norm_channels,
                    cross_attn=self.text_cond,
                    context_dim=self.text_embed_dim,
                )
            )

        self.mids = nn.ModuleList([])
        for i in range(len(self.mid_channels) - 1):
            # Cross attention and Context Dim are only used for text conditioning
            self.mids.append(
                MidBlock(
                    in_channels=self.mid_channels[i],
                    out_channels=self.mid_channels[i + 1],
                    t_emb_dim=self.t_emb_dim,
                    num_heads=self.num_heads,
                    num_layers=self.num_mid_layers,
                    norm_channels=self.norm_channels,
                    cross_attn=self.text_cond,
                    context_dim=self.text_embed_dim,
                )
            )

        self.ups = nn.ModuleList([])
        for i in reversed(range(len(self.down_channels) - 1)):
            # Cross attention and Context Dim are only used for text conditioning
            self.ups.append(
                UpBlockUNet(
                    in_channels=(self.down_channels[i] * 2),
                    out_channels=(
                        self.down_channels[i - 1] if i != 0 else self.conv_out_channels
                    ),
                    t_emb_dim=self.t_emb_dim,
                    up_sample=self.down_sample[i],
                    num_heads=self.num_heads,
                    num_layers=self.num_up_layers,
                    norm_channels=self.norm_channels,
                    cross_attn=self.text_cond,
                    context_dim=self.text_embed_dim,
                )
            )

        self.norm_out = nn.GroupNorm(self.norm_channels, self.conv_out_channels)

        self.conv_out = nn.Conv2d(
            in_channels=self.conv_out_channels,
            out_channels=image_channels,
            kernel_size=3,
            stride=1,
            padding=1,
        )  # (batch_size, conv_out_channels, h, w) -> (batch_size, image_channels, h, w)

    def forward(self, x, t, cond_input=None):
        # x shape: (batch_size, c, h, w)
        # cond_input is a dict of conditioning inputs
        # For class conditioning, it holds a one-hot vector of shape (batch_size, num_classes)

        if self.cond:
            assert (
                cond_input is not None
            ), "Model initialized with conditioning so cond_input cannot be None"

        if self.image_cond:
            # Mask Conditioning
            validate_image_conditional_input(cond_input, x)
            image_cond = cond_input["image"]
            image_cond = F.interpolate(image_cond, size=x.shape[-2:])
            image_cond = self.cond_conv_in(image_cond)
            assert image_cond.shape[-2:] == x.shape[-2:]

            x = torch.cat(
                [x, image_cond], dim=1
            )  # (batch_size, image_channels + image_cond_output_channels, h, w)
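            # e.g. (numbers illustrative): a 1-channel 128x128 mask resized to
            # a 16x16 latent grid, mapped to image_cond_output_channels channels
            # by the 1x1 conv, then concatenated with the latent channels above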
            out = self.conv_in_concat(x)  # (batch_size, down_channels[0], h, w)
        else:
            out = self.conv_in(x)  # (batch_size, down_channels[0], h, w)

        t_emb = get_time_embedding(
            torch.as_tensor(t).long(), self.t_emb_dim
        )  # (batch_size, t_emb_dim)
        t_emb = self.t_proj(t_emb)  # (batch_size, t_emb_dim)

        # Class Conditioning
        if self.class_cond:
            validate_class_conditional_input(cond_input, x, self.num_classes)

            # Multiply the one-hot class vectors with the embedding matrix to get the class embedding for every image in the batch
            class_embed = torch.matmul(
                cond_input["class"].float(), self.class_emb.weight
            )  # (batch_size, t_emb_dim)
            t_emb += class_embed  # Add the class embedding to the time embedding
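            # e.g. with num_classes = 3, a one-hot [0, 1, 0] times the
            # (3, t_emb_dim) embedding matrix selects row 1, i.e. the second
            # class's embedding; an all-zeros class vector adds nothing, which
            # is how unconditional samples reuse the same weights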

        context_hidden_states = None

        # Only use context hidden states in cross-attention for text conditioning
        if self.text_cond:
            assert (
                "text" in cond_input
            ), "Model initialized with text conditioning but cond_input has no text information"
            context_hidden_states = cond_input["text"]

        down_outs = []
        for idx, down in enumerate(self.downs):
            down_outs.append(out)
            out = down(
                out, t_emb, context_hidden_states
            )  # Use context_hidden_states for cross-attention
        # out = (batch_size, c4, h / 4, w / 4)

        for mid in self.mids:
            out = mid(out, t_emb, context_hidden_states)
        # out = (batch_size, c3, h / 4, w / 4)

        for up in self.ups:
            down_out = down_outs.pop()
            out = up(out, down_out, t_emb, context_hidden_states)
        # out = (batch_size, self.conv_out_channels, h, w)
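        # down_outs acts as a LIFO stack: the deepest skip connection is popped
        # first, pairing each UpBlock with the DownBlock output at its resolution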

        out = F.silu(self.norm_out(out))
        out = self.conv_out(
            out
        )  # (batch_size, self.conv_out_channels, h, w) -> (batch_size, image_channels, h, w)

        return out  # (batch_size, image_channels, h, w)
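

# A minimal smoke test (not part of the original module) exercising the UNet
# with class conditioning; it assumes get_time_embedding and the block classes
# defined earlier in this file. Config values are illustrative, picked only to
# satisfy the asserts in __init__.
def _unet_smoke_test():
    toy_config = {
        "down_channels": [32, 64, 128, 128],
        "mid_channels": [128, 128],
        "time_emb_dim": 128,
        "down_sample": [True, True, False],
        "num_down_layers": 1,
        "num_mid_layers": 1,
        "num_up_layers": 1,
        "attn_down": [True, True, True],
        "norm_channels": 8,
        "num_heads": 4,
        "conv_out_channels": 32,
        "condition_config": {
            "condition_types": ["class"],
            "class_condition_config": {"num_classes": 10},
        },
    }
    model = UNet(image_channels=4, model_config=toy_config)
    x = torch.randn(2, 4, 16, 16)  # e.g. a batch of VQVAE latents
    t = torch.randint(0, 1000, (2,))
    one_hot = F.one_hot(torch.tensor([3, 7]), num_classes=10)
    out = model(x, t, cond_input={"class": one_hot})
    assert out.shape == x.shape  # the UNet predicts noise with the input's shape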