import torch
import torch.nn as nn
import math


class ExtractPatches(nn.Module):
    def __init__(self, patch_size: int = 16):
        """
        patch_size: side length (in pixels) of each square patch
        """
        super().__init__()

        self.patch_size = patch_size
        self.unfold = nn.Unfold(kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        batch_size, c, h, w = x.shape

        # Unfold applies a sliding window to extract non-overlapping patches
        # The transpose and reshape flatten each patch, giving (batch_size, num_patches, c * patch_size * patch_size)
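        # e.g. a 64x64 RGB image with patch_size=16 yields (64 / 16)^2 = 16 patches,
        # each flattened to 3 * 16 * 16 = 768 values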
        return (
            self.unfold(x)
            .transpose(1, 2)
            .reshape(batch_size, -1, c * self.patch_size * self.patch_size)
        )


# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int):
        """
        d_model: dimensions of the embeddings (number of values in each embedding vector)
        """
        super().__init__()

        # Instead of precomputing fixed values, we compute the encoding in the forward pass using the sinusoidal encoding formula
        self.d_model = d_model

    def forward(self, x):
        """
        x: 1D tensor of position indices (seq_length,)
        """
        device = x.device
        half_dim = self.d_model // 2  # Use half for sin and half for cos
        emb = math.log(10000.0) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[:, None] * emb[None, :]  # (seq_length, half_dim)
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb


# Multi-Head Self-Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model: int = 512, n_heads: int = 8, dropout: float = 0.1):
        """
        d_model: dimensions of the embeddings (number of values in each embedding vector)
        n_heads: number of self attention heads per sequence
        dropout: probability of dropout
        """
        super().__init__()
        assert (
            d_model % n_heads == 0
        )  # We want to make sure that the dimensions are split evenly among the attention heads.
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_key = d_model // n_heads

        self.Wq = nn.Linear(d_model, d_model)  # Learnable weights for query
        self.Wk = nn.Linear(d_model, d_model)  # Learnable weights for key
        self.Wv = nn.Linear(d_model, d_model)  # Learnable weights for value
        self.Wo = nn.Linear(d_model, d_model)  # Learnable weights for output

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """
        query: (batch_size, q_length, d_model)
        key: (batch_size, k_length, d_model)
        value: (batch_size, v_length, d_model)
        """
        batch_size = key.size(0)

        # Matrix multiplication for Q, K, and V tensors
        Q = self.Wq(query)
        K = self.Wk(key)
        V = self.Wv(value)

        # Split each tensor into heads
        Q = Q.view(batch_size, -1, self.n_heads, self.d_key).permute(
            0, 2, 1, 3
        )  # (batch_size, n_heads, q_length, d_key)
        K = K.view(batch_size, -1, self.n_heads, self.d_key).permute(
            0, 2, 1, 3
        )  # (batch_size, n_heads, k_length, d_key)
        V = V.view(batch_size, -1, self.n_heads, self.d_key).permute(
            0, 2, 1, 3
        )  # (batch_size, n_heads, v_length, d_key)

        # Scaled dot product
        # K^T becomes (batch_size, n_heads, d_key, k_length)
        scaled_dot_product = torch.matmul(Q, K.permute(0, 1, 3, 2)) / math.sqrt(
            self.d_key
        )  # (batch_size, n_heads, q_length, k_length)

        if mask is not None:
            scaled_dot_product = scaled_dot_product.masked_fill(
                mask == 0, -float("inf")
            )  # Filling with 0 would still give a weight of e^0 = 1 after the softmax. Instead we fill masked positions with negative infinity so they receive (near) zero attention

        # Softmax function for attention probabilities
        attention_probs = torch.softmax(scaled_dot_product, dim=-1)

        # Multiply by V to get attention with respect to the values
        A = torch.matmul(self.dropout(attention_probs), V)

        # Reshape attention back to (batch_size, q_length, d_model)
        A = (
            A.permute(0, 2, 1, 3)
            .contiguous()
            .view(batch_size, -1, self.n_heads * self.d_key)
        )

        # Pass through the final linear layer
        output = self.Wo(A)

        return (
            output,
            attention_probs,
        )  # Output shape: (batch_size, q_length, d_model), Attention probs shape: (batch_size, n_heads, q_length, k_length)


# Position-Wise Feed Forward Network (FFN)
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1):
        """
        d_model: dimensions of the embeddings (number of values in each embedding vector)
        dropout: probability of dropout
        """
        super().__init__()

        self.ffn = nn.Sequential(
            nn.Linear(in_features=d_model, out_features=(d_model * 4)),
            nn.ReLU(),
            nn.Linear(in_features=(d_model * 4), out_features=d_model),
            nn.Dropout(p=dropout),
        )

    def forward(self, x):
        return self.ffn(x)


# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1):
        """
        d_model: dimensions of the embeddings (number of values in each embedding vector)
        n_heads: number of self attention heads per sequence
        dropout: probability of dropout
        """
        super().__init__()

        # Multi-Head Self-Attention sublayer
        self.attention = MultiHeadAttention(
            d_model=d_model, n_heads=n_heads, dropout=dropout
        )
        self.attention_layer_norm = nn.LayerNorm(d_model)  # Layer normalization

        # Position-wise Feed-forward Network
        self.position_wise_ffn = PositionwiseFeedForward(
            d_model=d_model, dropout=dropout
        )
        self.ffn_layer_norm = nn.LayerNorm(d_model)  # Layer normalization

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """
        src: embedded sequences (batch_size, seq_length, d_model)
        """
        # Multi-Head Attention

        _src, attention_probs = self.attention(
            src, src, src, None
        )  # Q, K, V, src_mask: no source mask is needed because every image yields the same number of patches (no padding)

        # Residual Addition and Layer Normalization
        src = self.attention_layer_norm(
            src + self.dropout(_src)
        )  # We do residual addition by adding back the src (the embeddings) to the output of Self-Attention

        # Position-wise Feed-forward Network
        _src = self.position_wise_ffn(src)

        # Residual Addition and Layer Normalization
        src = self.ffn_layer_norm(src + self.dropout(_src))

        return src, attention_probs


# The Encoder that takes in images and returns the encoding to be passed into the decoder
class Encoder(nn.Module):
    def __init__(
        self,
        image_size: int,
        in_channels: int,
        patch_size: int = 16,
        d_model: int = 128,
        n_layers: int = 3,
        n_heads: int = 4,
        dropout: float = 0.1,
    ):
        """
        d_model: dimensions of the embeddings (number of values in each embedding vector)
        n_layers: number of encoder layers in the encoder block
        n_heads: number of self attention heads per sequence
        dropout: probability of dropout
        """
        super().__init__()

        self.patch_size = patch_size

        self.extract_patches = ExtractPatches(patch_size=patch_size)
        self.fc_in = nn.Linear(in_channels * patch_size * patch_size, d_model)

        seq_length = (image_size // patch_size) ** 2

        # Image src is going to use a learnable positional encoding
        self.pos_embedding = nn.Parameter(
            torch.empty(1, seq_length, d_model).normal_(std=0.02)
        )

        # Create n_layers encoders
        self.layers = nn.ModuleList(
            [
                EncoderLayer(d_model=d_model, n_heads=n_heads, dropout=dropout)
                for _ in range(n_layers)
            ]
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """
        src: batch of images (batch_size, in_channels, image_size, image_size)
        """

        # Extract the patches and apply a linear layer
        batch_size = src.shape[0]
        src = self.fc_in(self.extract_patches(src))

        # Add the learned positional embedding
        src = src + self.pos_embedding

        # Pass the sequences through each encoder layer
        for layer in self.layers:
            src, attention_probs = layer(src)

        self.attention_probs = attention_probs

        return src


# Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1):
        """
        d_model: dimensions of the embeddings (number of values in each embedding vector)
        n_heads: number of self attention heads per sequence
        dropout: probability of dropout
        """
        super().__init__()

        # Masked Multi-Head Self-Attention sublayer
        self.masked_attention = MultiHeadAttention(
            d_model=d_model, n_heads=n_heads, dropout=dropout
        )
        self.masked_attention_layer_norm = nn.LayerNorm(d_model)  # Layer normalization

        # Multi-Head Self-Attention sublayer
        self.attention = MultiHeadAttention(
            d_model=d_model, n_heads=n_heads, dropout=dropout
        )
        self.attention_layer_norm = nn.LayerNorm(d_model)  # Layer normalization

        # Position-wise Feed-forward Network
        self.position_wise_ffn = PositionwiseFeedForward(
            d_model=d_model, dropout=dropout
        )
        self.ffn_layer_norm = nn.LayerNorm(d_model)  # Layer normalization

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, trg, src, trg_mask):
        """
        trg: embedded captions (batch_size, trg_seq_length, d_model)
        src: embedded images (batch_size, src_seq_length, d_model)
        trg_mask: mask for the captions preventing peeking at future tokens (batch_size, 1, trg_seq_length, trg_seq_length)
        """

        # Masked Multi-Head Attention

        # The target mask is used to prevent the model from seeing future tokens. This ensures that the prediction is made solely based on past and present tokens.
        _trg, masked_attention_probs = self.masked_attention(
            trg, trg, trg, trg_mask
        )  # Q, K, V, mask

        # Residual Addition and Layer Normalization
        trg = self.masked_attention_layer_norm(trg + self.dropout(_trg))

        # Multi-Head Attention - This time, we also pass in the output of the encoder layers as src.
        # This is important because this allows us to keep track of and learn relationships between the input and output tokens.
        _trg, attention_probs = self.attention(trg, src, src, None)  # Q, K, V, mask
        # Residual Addition and Layer Normalization
        trg = self.attention_layer_norm(trg + self.dropout(_trg))

        # Position-wise Feed-forward Network
        _trg = self.position_wise_ffn(trg)
        # Residual Addition and Layer Normalization
        trg = self.ffn_layer_norm(trg + self.dropout(_trg))

        return trg, attention_probs, masked_attention_probs


# The Decoder Module that takes the encoded images from the encoder and generates captions
class Decoder(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        d_model: int = 128,
        n_layers: int = 3,
        n_heads: int = 4,
        dropout: float = 0.1,
    ):
        """
        vocab_size: size of the target vocabulary
        d_model: dimensions of the embeddings (number of values in each embedding vector)
        n_layers: number of decoder layers in the decoder block
        n_heads: number of self attention heads per sequence
        dropout: probability of dropout
        """
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)

        # Scale down the randomly initialized embedding weights
        self.embedding.weight.data = 0.001 * self.embedding.weight.data

        # Initialize sinusoidal positional embeddings
        self.pos_emb = PositionalEncoding(d_model=d_model)

        # Create n_layers decoders
        self.layers = nn.ModuleList(
            [
                DecoderLayer(d_model=d_model, n_heads=n_heads, dropout=dropout)
                for _ in range(n_layers)
            ]
        )
        self.dropout = nn.Dropout(p=dropout)

        # Output layer
        self.Wo = nn.Linear(in_features=d_model, out_features=vocab_size)

    def make_trg_mask(self, trg):
        seq_length = trg.shape[1]

        trg_mask = torch.tril(
            torch.ones((seq_length, seq_length), device=trg.device)
        ).bool()
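        # e.g. for seq_length = 3 the lower-triangular mask is:
        # [[1, 0, 0],
        #  [1, 1, 0],
        #  [1, 1, 1]]  (row i can attend to positions <= i)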

        return trg_mask.unsqueeze(0).unsqueeze(
            0
        )  # (batch_size=1, n_heads=1, seq_length, seq_length)

    def forward(self, trg, src):
        """
        trg: target caption token indices (batch_size, trg_seq_length)
        src: encoded image patches (batch_size, src_seq_length, d_model)
        """

        # Embed the target captions
        trg = self.embedding(trg)
        batch_size, l, h = trg.shape

        trg_index = torch.arange(l, device=trg.device)
        pos_emb = self.pos_emb(trg_index).reshape(1, l, h).expand(batch_size, l, h)
        # Add the fixed sinusoidal positional embedding
        trg += pos_emb

        # Create a target mask for the target captions to prevent the model from peeking at future tokens
        trg_mask = self.make_trg_mask(
            trg
        )  # (batch_size, 1, trg_seq_length, trg_seq_length)

        # Pass the sequences through each decoder layer
        for layer in self.layers:
            trg, attention_probs, masked_attention_probs = layer(trg, src, trg_mask)

        self.attention_probs = attention_probs  # (batch_size, n_heads, trg_seq_len, src_seq_len): trg_seq_len is the caption length, src_seq_len is the number of image patches
        self.masked_attention_probs = masked_attention_probs  # (batch_size, n_heads, trg_seq_len, trg_seq_len)

        # Final linear output layer
        return self.Wo(trg)


class CaptioningTransformer(nn.Module):
    def __init__(
        self,
        image_size: int,
        in_channels: int,
        vocab_size: int,
        device,
        patch_size: int = 16,
        d_model: int = 128,
        n_layers: int = 3,
        n_heads: int = 4,
    ):
        super().__init__()

        self.device = device

        # Create an encoder and decoder with specified parameters
        self.encoder = Encoder(
            image_size=image_size,
            in_channels=in_channels,
            patch_size=patch_size,
            d_model=d_model,
            n_layers=n_layers,
            n_heads=n_heads,
        )

        self.decoder = Decoder(
            vocab_size=vocab_size, d_model=d_model, n_layers=n_layers, n_heads=n_heads
        )

    def forward(self, src, trg):
        # Encoder layers
        src = self.encoder(src)  # (batch_size, src_seq_length, d_model)

        # Decoder layers
        output = self.decoder(
            trg, src
        )  # Pass in both the target (for Masked Multi-Head Self-Attention) and the source (for Cross-Attention)

        return output
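

# Minimal smoke test sketch: the image size, vocabulary size, and caption length
# below are illustrative values (assumptions, not taken from this file) chosen
# only to exercise the forward pass and check the output shape.
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    image_size, in_channels, vocab_size = 64, 3, 1000
    batch_size, caption_length = 2, 12

    model = CaptioningTransformer(
        image_size=image_size,
        in_channels=in_channels,
        vocab_size=vocab_size,
        device=device,
        patch_size=16,
        d_model=128,
        n_layers=3,
        n_heads=4,
    ).to(device)

    # Dummy batch: random images and random caption token indices
    images = torch.randn(
        batch_size, in_channels, image_size, image_size, device=device
    )
    captions = torch.randint(
        0, vocab_size, (batch_size, caption_length), device=device
    )

    logits = model(images, captions)
    print(logits.shape)  # expected: torch.Size([2, 12, 1000])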