Upload 18 files
- model/.DS_Store +0 -0
- model/csp_tiny_layer.py +86 -0
- model/mamba_hf.py +41 -0
- model/modules/.DS_Store +0 -0
- model/modules/Conformer.py +1094 -0
- model/modules/Conmamba.py +607 -0
- model/modules/Transformer.py +1085 -0
- model/modules/TransformerASR.py +682 -0
- model/modules/__init__.py +0 -0
- model/modules/mamba/.DS_Store +0 -0
- model/modules/mamba/__init__.py +0 -0
- model/modules/mamba/bimamba.py +465 -0
- model/modules/mamba/mamba_blocks.py +252 -0
- model/modules/mamba/selective_scan_interface.py +714 -0
- model/patchify.py +20 -0
- model/sinc_conv.py +471 -0
- model/tiny_block.py +31 -0
- model/tinyvad.py +62 -0
model/.DS_Store
ADDED
Binary file (6.15 kB).
model/csp_tiny_layer.py
ADDED
@@ -0,0 +1,86 @@
import torch
import torch.nn as nn
from .tiny_block import TinyBlock
from transformers import MambaConfig, MambaModel
# from .conmamba import ConMamba

class CSPTinyLayer(nn.Module):
    def __init__(self, in_channels, out_channels, num_blocks, ssm=False):
        super(CSPTinyLayer, self).__init__()

        self.ssm = ssm

        # Split channels
        self.split_channels = in_channels // 2

        if self.ssm:
            # Mamba Blocks
            configuration = MambaConfig(vocab_size=0, hidden_size=self.split_channels, num_hidden_layers=num_blocks)
            self.mamba_blocks = MambaModel(configuration)

            # mamba_config = {
            #     'd_state': self.split_channels,
            #     'expand': 2,
            #     'd_conv': 4,
            #     'bidirectional': True
            # }
            # self.mamba_blocks = ConMamba(
            #     num_blocks=num_blocks,
            #     channels=self.split_channels,
            #     height=8,
            #     width=8,
            #     mamba_config=mamba_config
            # )

        else:
            # TinyBlocks
            self.tiny_blocks = nn.Sequential(
                *[TinyBlock(self.split_channels, self.split_channels) for _ in range(num_blocks)]
            )

        # Transition layer to adjust channel dimensions
        self.transition = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        # Split input into two parts
        p1 = x[:, :self.split_channels, :, :]
        p2 = x[:, self.split_channels:, :, :]

        if self.ssm:
            # Reshape to fit Mamba
            B, C, H, W = p2.shape
            p2 = p2.permute(0, 2, 3, 1)  # [B, H, W, C]
            p2 = p2.reshape(B, H * W, C)  # [B, L, C], L = H * W

            # Process p2 through MambaBlocks
            p2_out = self.mamba_blocks(inputs_embeds=p2).last_hidden_state

            # p2_out = self.mamba_blocks(p2)

            # Reshape back to original dimension
            p2_out = p2_out.reshape(B, H, W, -1)
            p2_out = p2_out.permute(0, 3, 1, 2)  # [B, C, H, W]
        else:
            # Process p2 through TinyBlocks
            p2_out = self.tiny_blocks(p2)

        # Concatenate p1 and processed p2
        concatenated = torch.cat((p1, p2_out), dim=1)

        # Apply transition layer
        out = self.transition(concatenated)
        return out

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model = CSPTinyLayer(32, 32, 2, True).to(device)
    print(model)
    dummy_input = torch.randn(256, 32, 8, 8).to(device)
    output = model(dummy_input)
    print(output.shape)
model/mamba_hf.py
ADDED
@@ -0,0 +1,41 @@
import torch
from transformers import MambaConfig, MambaModel, Mamba2Config, Mamba2Model

print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    print(f"CUDA version: {torch.version.cuda}")

batch, channel, height, width = 256, 16, 8, 8
x = torch.randn(batch, channel, height, width).to("cuda")
print(f'x: {x.shape}')

B, C, H, W = x.shape
x = x.permute(0, 2, 3, 1)  # [B, H, W, C]
print(f'Permuted x: {x.shape}')

x = x.reshape(B, H * W, C)  # [B, L, C], L = H * W
print(f'Reshaped x: {x.shape}')

# Initializing a Mamba configuration
configuration = MambaConfig(vocab_size=0, hidden_size=channel, num_hidden_layers=2)
# configuration = Mamba2Config(hidden_size=channel)

# Initializing a model (with random weights) from the configuration
model = MambaModel(configuration).to("cuda")
# model = Mamba2Model(configuration).to("cuda")
print(f'Model: {model}')

# Accessing the model configuration
configuration = model.config
print(f'Configuration: {configuration}')

# y = model(inputs_embeds=x).last_hidden_state
y = model(inputs_embeds=x, return_dict=True)[0]
print(f'y: {y.shape}')

y = y.reshape(B, H, W, -1)
print(f'Reshaped y: {y.shape}')

y = y.permute(0, 3, 1, 2)  # [B, C, H, W]
print(f'Permuted y: {y.shape}')
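The reshape round-trip in this script is the same one `CSPTinyLayer.forward` performs before and after its Mamba blocks. A short sketch of how it could be factored into a helper (the function name is hypothetical and not part of the uploaded code; it only relies on the `MambaModel(inputs_embeds=...)` call shown above):

# Hypothetical helper: run a sequence model over 2D feature maps by flattening H*W into a length axis.
def apply_over_hw(model, x):
    # x: [B, C, H, W] -> [B, H*W, C]
    B, C, H, W = x.shape
    seq = x.permute(0, 2, 3, 1).reshape(B, H * W, C)
    y = model(inputs_embeds=seq).last_hidden_state      # [B, H*W, C]
    return y.reshape(B, H, W, -1).permute(0, 3, 1, 2)   # back to [B, C, H, W]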
model/modules/.DS_Store
ADDED
Binary file (6.15 kB).
model/modules/Conformer.py
ADDED
@@ -0,0 +1,1094 @@
"""Conformer implementation.

Authors
-------
* Jianyuan Zhong 2020
* Samuele Cornell 2021
* Sylvain de Langen 2023
"""

import warnings
from dataclasses import dataclass
from typing import List, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F

import speechbrain as sb
from speechbrain.nnet.activations import Swish
from speechbrain.nnet.attention import (
    MultiheadAttention,
    PositionalwiseFeedForward,
    RelPosMHAXL,
)
from speechbrain.nnet.hypermixing import HyperMixing
from speechbrain.nnet.normalization import LayerNorm
from speechbrain.utils.dynamic_chunk_training import DynChunkTrainConfig


@dataclass
class ConformerEncoderLayerStreamingContext:
    """Streaming metadata and state for a `ConformerEncoderLayer`.

    The multi-head attention and Dynamic Chunk Convolution require to save some
    left context that gets inserted as left padding.

    See :class:`.ConvolutionModule` documentation for further details.
    """

    mha_left_context_size: int
    """For this layer, specifies how many frames of inputs should be saved.
    Usually, the same value is used across all layers, but this can be modified.
    """

    mha_left_context: Optional[torch.Tensor] = None
    """Left context to insert at the left of the current chunk as inputs to the
    multi-head attention. It can be `None` (if we're dealing with the first
    chunk) or `<= mha_left_context_size` because for the first few chunks, not
    enough left context may be available to pad.
    """

    dcconv_left_context: Optional[torch.Tensor] = None
    """Left context to insert at the left of the convolution according to the
    Dynamic Chunk Convolution method.

    Unlike `mha_left_context`, here the amount of frames to keep is fixed and
    inferred from the kernel size of the convolution module.
    """


@dataclass
class ConformerEncoderStreamingContext:
    """Streaming metadata and state for a `ConformerEncoder`."""

    dynchunktrain_config: DynChunkTrainConfig
    """Dynamic Chunk Training configuration holding chunk size and context size
    information."""

    layers: List[ConformerEncoderLayerStreamingContext]
    """Streaming metadata and state for each layer of the encoder."""


class ConvolutionModule(nn.Module):
    """This is an implementation of convolution module in Conformer.

    Arguments
    ---------
    input_size : int
        The expected size of the input embedding dimension.
    kernel_size: int, optional
        Kernel size of non-bottleneck convolutional layer.
    bias: bool, optional
        Whether to use bias in the non-bottleneck conv layer.
    activation: torch.nn.Module
        Activation function used after non-bottleneck conv layer.
    dropout: float, optional
        Dropout rate.
    causal: bool, optional
        Whether the convolution should be causal or not.
    dilation: int, optional
        Dilation factor for the non bottleneck conv layer.

    Example
    -------
    >>> import torch
    >>> x = torch.rand((8, 60, 512))
    >>> net = ConvolutionModule(512, 3)
    >>> output = net(x)
    >>> output.shape
    torch.Size([8, 60, 512])
    """

    def __init__(
        self,
        input_size,
        kernel_size=31,
        bias=True,
        activation=Swish,
        dropout=0.0,
        causal=False,
        dilation=1,
    ):
        super().__init__()

        self.kernel_size = kernel_size
        self.causal = causal
        self.dilation = dilation

        if self.causal:
            self.padding = (kernel_size - 1) * 2 ** (dilation - 1)
        else:
            self.padding = (kernel_size - 1) * 2 ** (dilation - 1) // 2

        self.layer_norm = nn.LayerNorm(input_size)
        self.bottleneck = nn.Sequential(
            # pointwise
            nn.Conv1d(
                input_size, 2 * input_size, kernel_size=1, stride=1, bias=bias
            ),
            nn.GLU(dim=1),
        )
        # depthwise
        self.conv = nn.Conv1d(
            input_size,
            input_size,
            kernel_size=kernel_size,
            stride=1,
            padding=self.padding,
            dilation=dilation,
            groups=input_size,
            bias=bias,
        )

        # BatchNorm in the original Conformer replaced with a LayerNorm due to
        # https://github.com/speechbrain/speechbrain/pull/1329
        # see discussion
        # https://github.com/speechbrain/speechbrain/pull/933#issuecomment-1033367884

        self.after_conv = nn.Sequential(
            nn.LayerNorm(input_size),
            activation(),
            # pointwise
            nn.Linear(input_size, input_size, bias=bias),
            nn.Dropout(dropout),
        )

    def forward(
        self,
        x: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        dynchunktrain_config: Optional[DynChunkTrainConfig] = None,
    ):
        """Applies the convolution to an input tensor `x`.

        Arguments
        ---------
        x: torch.Tensor
            Input tensor to the convolution module.
        mask: torch.Tensor, optional
            Mask to be applied over the output of the convolution using
            `masked_fill_`, if specified.
        dynchunktrain_config: DynChunkTrainConfig, optional
            If specified, makes the module support Dynamic Chunk Convolution
            (DCConv) as implemented by
            `Dynamic Chunk Convolution for Unified Streaming and Non-Streaming Conformer ASR <https://www.amazon.science/publications/dynamic-chunk-convolution-for-unified-streaming-and-non-streaming-conformer-asr>`_.
            This allows masking future frames while preserving better accuracy
            than a fully causal convolution, at a small speed cost.
            This should only be used for training (or, if you know what you're
            doing, for masked evaluation at inference time), as the forward
            streaming function should be used at inference time.

        Returns
        -------
        out: torch.Tensor
            The output tensor.
        """

        if dynchunktrain_config is not None:
            # chances are chunking+causal is unintended; i don't know where it
            # may make sense, but if it does to you, feel free to implement it.
            assert (
                not self.causal
            ), "Chunked convolution not supported with causal padding"

            assert (
                self.dilation == 1
            ), "Current DynChunkTrain logic does not support dilation != 1"

            # in a causal convolution, which is not the case here, an output
            # frame would never be able to depend on a input frame from any
            # point in the future.

            # but with the dynamic chunk convolution, we instead use a "normal"
            # convolution but where, for any output frame, the future beyond the
            # "current" chunk gets masked.
            # see the paper linked in the documentation for details.

            chunk_size = dynchunktrain_config.chunk_size
            batch_size = x.shape[0]

            # determine the amount of padding we need to insert at the right of
            # the last chunk so that all chunks end up with the same size.
            if x.shape[1] % chunk_size != 0:
                final_right_padding = chunk_size - (x.shape[1] % chunk_size)
            else:
                final_right_padding = 0

            # -> [batch_size, t, in_channels]
            out = self.layer_norm(x)

            # -> [batch_size, in_channels, t] for the CNN
            out = out.transpose(1, 2)

            # -> [batch_size, in_channels, t] (pointwise)
            out = self.bottleneck(out)

            # -> [batch_size, in_channels, lc+t+final_right_padding]
            out = F.pad(out, (self.padding, final_right_padding), value=0)

            # now, make chunks with left context.
            # as a recap to what the above padding and this unfold do, consider
            # each a/b/c letter represents a frame as part of chunks a, b, c.
            # consider a chunk size of 4 and a kernel size of 5 (padding=2):
            #
            # input seq: 00aaaabbbbcc00
            # chunk #1: 00aaaa
            # chunk #2: aabbbb
            # chunk #3: bbcc00
            #
            # a few remarks here:
            # - the left padding gets inserted early so that the unfold logic
            #   works trivially
            # - the right 0-padding got inserted as the number of time steps
            #   could not be evenly split in `chunk_size` chunks

            # -> [batch_size, in_channels, num_chunks, lc+chunk_size]
            out = out.unfold(2, size=chunk_size + self.padding, step=chunk_size)

            # as we manually disable padding in the convolution below, we insert
            # right 0-padding to the chunks, e.g. reusing the above example:
            #
            # chunk #1: 00aaaa00
            # chunk #2: aabbbb00
            # chunk #3: bbcc0000

            # -> [batch_size, in_channels, num_chunks, lc+chunk_size+rpad]
            out = F.pad(out, (0, self.padding), value=0)

            # the transpose+flatten effectively flattens chunks into the batch
            # dimension to be processed into the time-wise convolution. the
            # chunks will later on be unflattened.

            # -> [batch_size, num_chunks, in_channels, lc+chunk_size+rpad]
            out = out.transpose(1, 2)

            # -> [batch_size * num_chunks, in_channels, lc+chunk_size+rpad]
            out = out.flatten(start_dim=0, end_dim=1)

            # TODO: experiment around reflect padding, which is difficult
            # because small chunks have too little time steps to reflect from

            # let's keep backwards compat by pointing at the weights from the
            # already declared Conv1d.
            #
            # still reusing the above example, the convolution will be applied,
            # with the padding truncated on both ends. the following example
            # shows the letter corresponding to the input frame on which the
            # convolution was centered.
            #
            # as you can see, the sum of lengths of all chunks is equal to our
            # input sequence length + `final_right_padding`.
            #
            # chunk #1: aaaa
            # chunk #2: bbbb
            # chunk #3: cc00

            # -> [batch_size * num_chunks, out_channels, chunk_size]
            out = F.conv1d(
                out,
                weight=self.conv.weight,
                bias=self.conv.bias,
                stride=self.conv.stride,
                padding=0,
                dilation=self.conv.dilation,
                groups=self.conv.groups,
            )

            # -> [batch_size * num_chunks, chunk_size, out_channels]
            out = out.transpose(1, 2)

            out = self.after_conv(out)

            # -> [batch_size, num_chunks, chunk_size, out_channels]
            out = torch.unflatten(out, dim=0, sizes=(batch_size, -1))

            # -> [batch_size, t + final_right_padding, out_channels]
            out = torch.flatten(out, start_dim=1, end_dim=2)

            # -> [batch_size, t, out_channels]
            if final_right_padding > 0:
                out = out[:, :-final_right_padding, :]
        else:
            out = self.layer_norm(x)
            out = out.transpose(1, 2)
            out = self.bottleneck(out)
            out = self.conv(out)

            if self.causal:
                # chomp
                out = out[..., : -self.padding]

            out = out.transpose(1, 2)
            out = self.after_conv(out)

        if mask is not None:
            out.masked_fill_(mask, 0.0)

        return out


class ConformerEncoderLayer(nn.Module):
    """This is an implementation of Conformer encoder layer.

    Arguments
    ---------
    d_model : int
        The expected size of the input embedding.
    d_ffn : int
        Hidden size of self-attention Feed Forward layer.
    nhead : int
        Number of attention heads.
    kernel_size : int, optional
        Kernel size of convolution model.
    kdim : int, optional
        Dimension of the key.
    vdim : int, optional
        Dimension of the value.
    activation: torch.nn.Module
        Activation function used in each Conformer layer.
    bias : bool, optional
        Whether convolution module.
    dropout : int, optional
        Dropout for the encoder.
    causal : bool, optional
        Whether the convolutions should be causal or not.
    attention_type : str, optional
        type of attention layer, e.g. regularMHA for regular MultiHeadAttention.

    Example
    -------
    >>> import torch
    >>> x = torch.rand((8, 60, 512))
    >>> pos_embs = torch.rand((1, 2*60-1, 512))
    >>> net = ConformerEncoderLayer(d_ffn=512, nhead=8, d_model=512, kernel_size=3)
    >>> output = net(x, pos_embs=pos_embs)
    >>> output[0].shape
    torch.Size([8, 60, 512])
    """

    def __init__(
        self,
        d_model,
        d_ffn,
        nhead,
        kernel_size=31,
        kdim=None,
        vdim=None,
        activation=Swish,
        bias=True,
        dropout=0.0,
        causal=False,
        attention_type="RelPosMHAXL",
    ):
        super().__init__()

        if attention_type == "regularMHA":
            self.mha_layer = MultiheadAttention(
                nhead=nhead,
                d_model=d_model,
                dropout=dropout,
                kdim=kdim,
                vdim=vdim,
            )
        elif attention_type == "RelPosMHAXL":
            # transformerXL style positional encoding
            self.mha_layer = RelPosMHAXL(
                num_heads=nhead,
                embed_dim=d_model,
                dropout=dropout,
                mask_pos_future=causal,
            )
        elif attention_type == "hypermixing":
            self.mha_layer = HyperMixing(
                input_output_dim=d_model,
                hypernet_size=d_ffn,
                tied=False,
                num_heads=nhead,
                fix_tm_hidden_size=False,
            )

        self.convolution_module = ConvolutionModule(
            d_model, kernel_size, bias, activation, dropout, causal=causal
        )

        self.ffn_module1 = nn.Sequential(
            nn.LayerNorm(d_model),
            PositionalwiseFeedForward(
                d_ffn=d_ffn,
                input_size=d_model,
                dropout=dropout,
                activation=activation,
            ),
            nn.Dropout(dropout),
        )

        self.ffn_module2 = nn.Sequential(
            nn.LayerNorm(d_model),
            PositionalwiseFeedForward(
                d_ffn=d_ffn,
                input_size=d_model,
                dropout=dropout,
                activation=activation,
            ),
            nn.Dropout(dropout),
        )

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.drop = nn.Dropout(dropout)

    def forward(
        self,
        x,
        src_mask: Optional[torch.Tensor] = None,
        src_key_padding_mask: Optional[torch.Tensor] = None,
        pos_embs: torch.Tensor = None,
        dynchunktrain_config: Optional[DynChunkTrainConfig] = None,
    ):
        """
        Arguments
        ----------
        src : torch.Tensor
            The sequence to the encoder layer.
        src_mask : torch.Tensor, optional
            The mask for the src sequence.
        src_key_padding_mask : torch.Tensor, optional
            The mask for the src keys per batch.
        pos_embs: torch.Tensor, torch.nn.Module, optional
            Module or tensor containing the input sequence positional embeddings
        dynchunktrain_config: Optional[DynChunkTrainConfig]
            Dynamic Chunk Training configuration object for streaming,
            specifically involved here to apply Dynamic Chunk Convolution to
            the convolution module.
        """
        conv_mask: Optional[torch.Tensor] = None
        if src_key_padding_mask is not None:
            conv_mask = src_key_padding_mask.unsqueeze(-1)
        # ffn module
        x = x + 0.5 * self.ffn_module1(x)
        # multi-head attention module
        skip = x
        x = self.norm1(x)

        x, self_attn = self.mha_layer(
            x,
            x,
            x,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask,
            pos_embs=pos_embs,
        )
        x = x + skip
        # convolution module
        x = x + self.convolution_module(
            x, conv_mask, dynchunktrain_config=dynchunktrain_config
        )
        # ffn module
        x = self.norm2(x + 0.5 * self.ffn_module2(x))
        return x, self_attn

    def forward_streaming(
        self,
        x,
        context: ConformerEncoderLayerStreamingContext,
        pos_embs: torch.Tensor = None,
    ):
        """Conformer layer streaming forward (typically for
        DynamicChunkTraining-trained models), which is to be used at inference
        time. Relies on a mutable context object as initialized by
        `make_streaming_context` that should be used across chunks.
        Invoked by `ConformerEncoder.forward_streaming`.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor for this layer. Batching is supported as long as you
            keep the context consistent.
        context : ConformerEncoderStreamingContext
            Mutable streaming context; the same object should be passed across
            calls.
        pos_embs : torch.Tensor, optional
            Positional embeddings, if used.

        Returns
        -------
        x : torch.Tensor
            Output tensor.
        self_attn : list
            List of self attention values.
        """

        orig_len = x.shape[-2]
        # ffn module
        x = x + 0.5 * self.ffn_module1(x)

        # TODO: make the approach for MHA left context more efficient.
        # currently, this saves the inputs to the MHA.
        # the naive approach is suboptimal in a few ways, namely that the
        # outputs for this left padding is being re-computed even though we
        # discard them immediately after.

        # left pad `x` with our MHA left context
        if context.mha_left_context is not None:
            x = torch.cat((context.mha_left_context, x), dim=1)

        # compute new MHA left context for the next call to our function
        if context.mha_left_context_size > 0:
            context.mha_left_context = x[
                ..., -context.mha_left_context_size :, :
            ]

        # multi-head attention module
        skip = x
        x = self.norm1(x)

        x, self_attn = self.mha_layer(
            x,
            x,
            x,
            attn_mask=None,
            key_padding_mask=None,
            pos_embs=pos_embs,
        )
        x = x + skip

        # truncate outputs corresponding to the MHA left context (we only care
        # about our chunk's outputs); see above to-do
        x = x[..., -orig_len:, :]

        if context.dcconv_left_context is not None:
            x = torch.cat((context.dcconv_left_context, x), dim=1)

        # compute new DCConv left context for the next call to our function
        context.dcconv_left_context = x[
            ..., -self.convolution_module.padding :, :
        ]

        # convolution module
        x = x + self.convolution_module(x)

        # truncate outputs corresponding to the DCConv left context
        x = x[..., -orig_len:, :]

        # ffn module
        x = self.norm2(x + 0.5 * self.ffn_module2(x))
        return x, self_attn

    def make_streaming_context(self, mha_left_context_size: int):
        """Creates a blank streaming context for this encoding layer.

        Arguments
        ---------
        mha_left_context_size : int
            How many left frames should be saved and used as left context to the
            current chunk when streaming

        Returns
        -------
        ConformerEncoderLayerStreamingContext
        """
        return ConformerEncoderLayerStreamingContext(
            mha_left_context_size=mha_left_context_size
        )


class ConformerEncoder(nn.Module):
    """This class implements the Conformer encoder.

    Arguments
    ---------
    num_layers : int
        Number of layers.
    d_model : int
        Embedding dimension size.
    d_ffn : int
        Hidden size of self-attention Feed Forward layer.
    nhead : int
        Number of attention heads.
    kernel_size : int, optional
        Kernel size of convolution model.
    kdim : int, optional
        Dimension of the key.
    vdim : int, optional
        Dimension of the value.
    activation: torch.nn.Module
        Activation function used in each Confomer layer.
    bias : bool, optional
        Whether convolution module.
    dropout : int, optional
        Dropout for the encoder.
    causal: bool, optional
        Whether the convolutions should be causal or not.
    attention_type: str, optional
        type of attention layer, e.g. regularMHA for regular MultiHeadAttention.


    Example
    -------
    >>> import torch
    >>> x = torch.rand((8, 60, 512))
    >>> pos_emb = torch.rand((1, 2*60-1, 512))
    >>> net = ConformerEncoder(1, 512, 512, 8)
    >>> output, _ = net(x, pos_embs=pos_emb)
    >>> output.shape
    torch.Size([8, 60, 512])
    """

    def __init__(
        self,
        num_layers,
        d_model,
        d_ffn,
        nhead,
        kernel_size=31,
        kdim=None,
        vdim=None,
        activation=Swish,
        bias=True,
        dropout=0.0,
        causal=False,
        attention_type="RelPosMHAXL",
    ):
        super().__init__()

        self.layers = torch.nn.ModuleList(
            [
                ConformerEncoderLayer(
                    d_ffn=d_ffn,
                    nhead=nhead,
                    d_model=d_model,
                    kdim=kdim,
                    vdim=vdim,
                    dropout=dropout,
                    activation=activation,
                    kernel_size=kernel_size,
                    bias=bias,
                    causal=causal,
                    attention_type=attention_type,
                )
                for i in range(num_layers)
            ]
        )
        self.norm = LayerNorm(d_model, eps=1e-6)
        self.attention_type = attention_type

    def forward(
        self,
        src,
        src_mask: Optional[torch.Tensor] = None,
        src_key_padding_mask: Optional[torch.Tensor] = None,
        pos_embs: Optional[torch.Tensor] = None,
        dynchunktrain_config: Optional[DynChunkTrainConfig] = None,
    ):
        """
        Arguments
        ----------
        src : torch.Tensor
            The sequence to the encoder layer.
        src_mask : torch.Tensor, optional
            The mask for the src sequence.
        src_key_padding_mask : torch.Tensor, optional
            The mask for the src keys per batch.
        pos_embs: torch.Tensor, torch.nn.Module,
            Module or tensor containing the input sequence positional embeddings
            If custom pos_embs are given it needs to have the shape (1, 2*S-1, E)
            where S is the sequence length, and E is the embedding dimension.
        dynchunktrain_config: Optional[DynChunkTrainConfig]
            Dynamic Chunk Training configuration object for streaming,
            specifically involved here to apply Dynamic Chunk Convolution to the
            convolution module.
        """
        if self.attention_type == "RelPosMHAXL":
            if pos_embs is None:
                raise ValueError(
                    "The chosen attention type for the Conformer is RelPosMHAXL. For this attention type, the positional embeddings are mandatory"
                )

        output = src
        attention_lst = []
        for enc_layer in self.layers:
            output, attention = enc_layer(
                output,
                src_mask=src_mask,
                src_key_padding_mask=src_key_padding_mask,
                pos_embs=pos_embs,
                dynchunktrain_config=dynchunktrain_config,
            )
            attention_lst.append(attention)
        output = self.norm(output)

        return output, attention_lst

    def forward_streaming(
        self,
        src: torch.Tensor,
        context: ConformerEncoderStreamingContext,
        pos_embs: Optional[torch.Tensor] = None,
    ):
        """Conformer streaming forward (typically for
        DynamicChunkTraining-trained models), which is to be used at inference
        time. Relies on a mutable context object as initialized by
        `make_streaming_context` that should be used across chunks.

        Arguments
        ---------
        src : torch.Tensor
            Input tensor. Batching is supported as long as you keep the context
            consistent.
        context : ConformerEncoderStreamingContext
            Mutable streaming context; the same object should be passed across
            calls.
        pos_embs : torch.Tensor, optional
            Positional embeddings, if used.

        Returns
        -------
        output : torch.Tensor
            The output of the streaming conformer.
        attention_lst : list
            The attention values.
        """

        if self.attention_type == "RelPosMHAXL":
            if pos_embs is None:
                raise ValueError(
                    "The chosen attention type for the Conformer is RelPosMHAXL. For this attention type, the positional embeddings are mandatory"
                )

        output = src
        attention_lst = []
        for i, enc_layer in enumerate(self.layers):
            output, attention = enc_layer.forward_streaming(
                output, pos_embs=pos_embs, context=context.layers[i]
            )
            attention_lst.append(attention)
        output = self.norm(output)

        return output, attention_lst

    def make_streaming_context(self, dynchunktrain_config: DynChunkTrainConfig):
        """Creates a blank streaming context for the encoder.

        Arguments
        ---------
        dynchunktrain_config: Optional[DynChunkTrainConfig]
            Dynamic Chunk Training configuration object for streaming

        Returns
        -------
        ConformerEncoderStreamingContext
        """
        return ConformerEncoderStreamingContext(
            dynchunktrain_config=dynchunktrain_config,
            layers=[
                layer.make_streaming_context(
                    mha_left_context_size=dynchunktrain_config.left_context_size_frames()
                )
                for layer in self.layers
            ],
        )


class ConformerDecoderLayer(nn.Module):
    """This is an implementation of Conformer encoder layer.

    Arguments
    ---------
    d_model : int
        The expected size of the input embedding.
    d_ffn : int
        Hidden size of self-attention Feed Forward layer.
    nhead : int
        Number of attention heads.
    kernel_size : int, optional
        Kernel size of convolution model.
    kdim : int, optional
        Dimension of the key.
    vdim : int, optional
        Dimension of the value.
    activation : torch.nn.Module, optional
        Activation function used in each Conformer layer.
    bias : bool, optional
        Whether convolution module.
    dropout : int, optional
        Dropout for the encoder.
    causal : bool, optional
        Whether the convolutions should be causal or not.
    attention_type : str, optional
        type of attention layer, e.g. regularMHA for regular MultiHeadAttention.

    Example
    -------
    >>> import torch
    >>> x = torch.rand((8, 60, 512))
    >>> pos_embs = torch.rand((1, 2*60-1, 512))
    >>> net = ConformerEncoderLayer(d_ffn=512, nhead=8, d_model=512, kernel_size=3)
    >>> output = net(x, pos_embs=pos_embs)
    >>> output[0].shape
    torch.Size([8, 60, 512])
    """

    def __init__(
        self,
        d_model,
        d_ffn,
        nhead,
        kernel_size,
        kdim=None,
        vdim=None,
        activation=Swish,
        bias=True,
        dropout=0.0,
        causal=True,
        attention_type="RelPosMHAXL",
    ):
        super().__init__()

        if not causal:
            warnings.warn(
                "Decoder is not causal, in most applications it should be causal, you have been warned !"
            )

        if attention_type == "regularMHA":
            self.mha_layer = MultiheadAttention(
                nhead=nhead,
                d_model=d_model,
                dropout=dropout,
                kdim=kdim,
                vdim=vdim,
            )
        elif attention_type == "RelPosMHAXL":
            # transformerXL style positional encoding
            self.mha_layer = RelPosMHAXL(
                num_heads=nhead,
                embed_dim=d_model,
                dropout=dropout,
                mask_pos_future=causal,
            )

        self.convolution_module = ConvolutionModule(
            d_model, kernel_size, bias, activation, dropout, causal=causal
        )

        self.ffn_module1 = nn.Sequential(
            nn.LayerNorm(d_model),
            PositionalwiseFeedForward(
                d_ffn=d_ffn,
                input_size=d_model,
                dropout=dropout,
                activation=activation,
            ),
            nn.Dropout(dropout),
        )

        self.ffn_module2 = nn.Sequential(
            nn.LayerNorm(d_model),
            PositionalwiseFeedForward(
                d_ffn=d_ffn,
                input_size=d_model,
                dropout=dropout,
                activation=activation,
            ),
            nn.Dropout(dropout),
        )

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.drop = nn.Dropout(dropout)

    def forward(
        self,
        tgt,
        memory,
        tgt_mask=None,
        memory_mask=None,
        tgt_key_padding_mask=None,
        memory_key_padding_mask=None,
        pos_embs_tgt=None,
        pos_embs_src=None,
    ):
        """
        Arguments
        ---------
        tgt: torch.Tensor
            The sequence to the decoder layer.
        memory: torch.Tensor
            The sequence from the last layer of the encoder.
        tgt_mask: torch.Tensor, optional, optional
            The mask for the tgt sequence.
        memory_mask: torch.Tensor, optional
            The mask for the memory sequence.
        tgt_key_padding_mask: torch.Tensor, optional
            The mask for the tgt keys per batch.
        memory_key_padding_mask: torch.Tensor, optional
            The mask for the memory keys per batch.
        pos_embs_tgt: torch.Tensor, torch.nn.Module, optional
            Module or tensor containing the target sequence positional embeddings for each attention layer.
        pos_embs_src: torch.Tensor, torch.nn.Module, optional
            Module or tensor containing the source sequence positional embeddings for each attention layer.

        Returns
        -------
        x: torch.Tensor
            The output tensor
        self_attn : torch.Tensor
        self_attn : torch.Tensor
            The self attention tensor
        """
        # ffn module
        tgt = tgt + 0.5 * self.ffn_module1(tgt)
        # multi-head attention module
        skip = tgt
        x = self.norm1(tgt)
        x, self_attn = self.mha_layer(
            x,
            memory,
            memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
            pos_embs=pos_embs_src,
        )
        x = x + skip
        # convolution module
        x = x + self.convolution_module(x)
        # ffn module
        x = self.norm2(x + 0.5 * self.ffn_module2(x))
        return x, self_attn, self_attn


class ConformerDecoder(nn.Module):
    """This class implements the Transformer decoder.

    Arguments
    ---------
    num_layers: int
        Number of layers.
    nhead: int
        Number of attention heads.
    d_ffn: int
        Hidden size of self-attention Feed Forward layer.
    d_model: int
        Embedding dimension size.
    kdim: int, optional
        Dimension for key.
    vdim: int, optional
        Dimension for value.
    dropout: float, optional
        Dropout rate.
    activation: torch.nn.Module, optional
        Activation function used after non-bottleneck conv layer.
    kernel_size : int, optional
        Kernel size of convolutional layer.
    bias : bool, optional
        Whether convolution module.
    causal: bool, optional
        Whether the convolutions should be causal or not.
    attention_type: str, optional
        type of attention layer, e.g. regularMHA for regular MultiHeadAttention.


    Example
    -------
    >>> src = torch.rand((8, 60, 512))
    >>> tgt = torch.rand((8, 60, 512))
    >>> net = ConformerDecoder(1, 8, 1024, 512, attention_type="regularMHA")
    >>> output, _, _ = net(tgt, src)
    >>> output.shape
    torch.Size([8, 60, 512])
    """

    def __init__(
        self,
        num_layers,
        nhead,
        d_ffn,
        d_model,
        kdim=None,
        vdim=None,
        dropout=0.0,
        activation=Swish,
        kernel_size=3,
        bias=True,
        causal=True,
        attention_type="RelPosMHAXL",
    ):
        super().__init__()
        self.layers = torch.nn.ModuleList(
            [
                ConformerDecoderLayer(
                    d_ffn=d_ffn,
                    nhead=nhead,
                    d_model=d_model,
                    kdim=kdim,
                    vdim=vdim,
                    dropout=dropout,
                    activation=activation,
                    kernel_size=kernel_size,
                    bias=bias,
                    causal=causal,
                    attention_type=attention_type,
                )
                for _ in range(num_layers)
            ]
        )
        self.norm = sb.nnet.normalization.LayerNorm(d_model, eps=1e-6)

    def forward(
        self,
        tgt,
        memory,
        tgt_mask=None,
        memory_mask=None,
        tgt_key_padding_mask=None,
        memory_key_padding_mask=None,
        pos_embs_tgt=None,
        pos_embs_src=None,
    ):
        """
        Arguments
        ---------
        tgt: torch.Tensor
            The sequence to the decoder layer.
        memory: torch.Tensor
            The sequence from the last layer of the encoder.
        tgt_mask: torch.Tensor, optional, optional
            The mask for the tgt sequence.
        memory_mask: torch.Tensor, optional
            The mask for the memory sequence.
        tgt_key_padding_mask : torch.Tensor, optional
            The mask for the tgt keys per batch.
        memory_key_padding_mask : torch.Tensor, optional
            The mask for the memory keys per batch.
        pos_embs_tgt: torch.Tensor, torch.nn.Module, optional
            Module or tensor containing the target sequence positional embeddings for each attention layer.
        pos_embs_src: torch.Tensor, torch.nn.Module, optional
            Module or tensor containing the source sequence positional embeddings for each attention layer.

        Returns
        -------
        output: torch.Tensor
            Conformer decoder output.
        self_attns : list
            Location of self attentions.
        multihead_attns : list
            Location of multihead attentions.
        """
        output = tgt
        self_attns, multihead_attns = [], []
        for dec_layer in self.layers:
            output, self_attn, multihead_attn = dec_layer(
                output,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                tgt_key_padding_mask=tgt_key_padding_mask,
                memory_key_padding_mask=memory_key_padding_mask,
                pos_embs_tgt=pos_embs_tgt,
                pos_embs_src=pos_embs_src,
            )
            self_attns.append(self_attn)
            multihead_attns.append(multihead_attn)
        output = self.norm(output)

        return output, self_attns, multihead_attns
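A brief usage sketch of the streaming path defined above. Only `make_streaming_context` and `forward_streaming` come from the code in this file; the `DynChunkTrainConfig` constructor arguments and the chunk sizes are assumptions based on SpeechBrain's dynamic chunk training utilities, not part of this upload:

# Hypothetical streaming sketch: feed fixed-size chunks through a DynChunkTrain-trained encoder.
# Assumes DynChunkTrainConfig(chunk_size=..., left_context_size=...) as in SpeechBrain.
encoder = ConformerEncoder(num_layers=12, d_model=256, d_ffn=1024, nhead=4, attention_type="regularMHA")
cfg = DynChunkTrainConfig(chunk_size=16, left_context_size=2)
ctx = encoder.make_streaming_context(cfg)      # per-layer MHA/DCConv left-context state
stream = torch.rand(1, 64, 256)                # pretend input: 4 chunks of 16 frames each
for chunk in stream.split(16, dim=1):
    out, _ = encoder.forward_streaming(chunk, context=ctx)  # ctx is mutated across calls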
model/modules/Conmamba.py
ADDED
@@ -0,0 +1,607 @@
1 |
+
"""ConMamba encoder and Mamba decoder implementation.
|
2 |
+
|
3 |
+
Authors
|
4 |
+
-------
|
5 |
+
* Xilin Jiang 2024
|
6 |
+
"""
|
7 |
+
|
8 |
+
import warnings
|
9 |
+
from dataclasses import dataclass
|
10 |
+
from typing import List, Optional
|
11 |
+
|
12 |
+
import torch
|
13 |
+
import torch.nn as nn
|
14 |
+
import torch.nn.functional as F
|
15 |
+
|
16 |
+
import speechbrain as sb
|
17 |
+
from speechbrain.nnet.activations import Swish
|
18 |
+
from speechbrain.nnet.attention import (
|
19 |
+
MultiheadAttention,
|
20 |
+
PositionalwiseFeedForward,
|
21 |
+
RelPosMHAXL,
|
22 |
+
)
|
23 |
+
from speechbrain.nnet.hypermixing import HyperMixing
|
24 |
+
from speechbrain.nnet.normalization import LayerNorm
|
25 |
+
from speechbrain.utils.dynamic_chunk_training import DynChunkTrainConfig
|
26 |
+
|
27 |
+
# Mamba
|
28 |
+
from mamba_ssm import Mamba
|
29 |
+
from .mamba.bimamba import Mamba as BiMamba
|
30 |
+
|
31 |
+
|
32 |
+
class ConvolutionModule(nn.Module):
|
33 |
+
"""This is an implementation of convolution module in Conmamba.
|
34 |
+
"""
|
35 |
+
|
36 |
+
def __init__(
|
37 |
+
self,
|
38 |
+
input_size,
|
39 |
+
kernel_size=31,
|
40 |
+
bias=True,
|
41 |
+
activation=Swish,
|
42 |
+
dropout=0.0,
|
43 |
+
causal=False,
|
44 |
+
dilation=1,
|
45 |
+
):
|
46 |
+
super().__init__()
|
47 |
+
|
48 |
+
self.kernel_size = kernel_size
|
49 |
+
self.causal = causal
|
50 |
+
self.dilation = dilation
|
51 |
+
|
52 |
+
if self.causal:
|
53 |
+
self.padding = (kernel_size - 1) * 2 ** (dilation - 1)
|
54 |
+
else:
|
55 |
+
self.padding = (kernel_size - 1) * 2 ** (dilation - 1) // 2
|
56 |
+
|
57 |
+
self.layer_norm = nn.LayerNorm(input_size)
|
58 |
+
self.bottleneck = nn.Sequential(
|
59 |
+
# pointwise
|
60 |
+
nn.Conv1d(
|
61 |
+
input_size, 2 * input_size, kernel_size=1, stride=1, bias=bias
|
62 |
+
),
|
63 |
+
nn.GLU(dim=1),
|
64 |
+
)
|
65 |
+
# depthwise
|
66 |
+
self.conv = nn.Conv1d(
|
67 |
+
input_size,
|
68 |
+
input_size,
|
69 |
+
kernel_size=kernel_size,
|
70 |
+
stride=1,
|
71 |
+
padding=self.padding,
|
72 |
+
dilation=dilation,
|
73 |
+
groups=input_size,
|
74 |
+
bias=bias,
|
75 |
+
)
|
76 |
+
|
77 |
+
# BatchNorm in the original Conformer replaced with a LayerNorm due to
|
78 |
+
# https://github.com/speechbrain/speechbrain/pull/1329
|
79 |
+
# see discussion
|
80 |
+
# https://github.com/speechbrain/speechbrain/pull/933#issuecomment-1033367884
|
81 |
+
|
82 |
+
self.after_conv = nn.Sequential(
|
83 |
+
nn.LayerNorm(input_size),
|
84 |
+
activation(),
|
85 |
+
# pointwise
|
86 |
+
nn.Linear(input_size, input_size, bias=bias),
|
87 |
+
nn.Dropout(dropout),
|
88 |
+
)
|
89 |
+
|
90 |
+
def forward(
|
91 |
+
self,
|
92 |
+
x: torch.Tensor,
|
93 |
+
mask: Optional[torch.Tensor] = None,
|
94 |
+
dynchunktrain_config: Optional[DynChunkTrainConfig] = None,
|
95 |
+
):
|
96 |
+
"""Applies the convolution to an input tensor `x`.
|
97 |
+
"""
|
98 |
+
|
99 |
+
if dynchunktrain_config is not None:
|
100 |
+
# chances are chunking+causal is unintended; i don't know where it
|
101 |
+
# may make sense, but if it does to you, feel free to implement it.
|
102 |
+
assert (
|
103 |
+
not self.causal
|
104 |
+
), "Chunked convolution not supported with causal padding"
|
105 |
+
|
106 |
+
assert (
|
107 |
+
self.dilation == 1
|
108 |
+
), "Current DynChunkTrain logic does not support dilation != 1"
|
109 |
+
|
110 |
+
# in a causal convolution, which is not the case here, an output
|
111 |
+
# frame would never be able to depend on a input frame from any
|
112 |
+
# point in the future.
|
113 |
+
|
114 |
+
# but with the dynamic chunk convolution, we instead use a "normal"
|
115 |
+
# convolution but where, for any output frame, the future beyond the
|
116 |
+
# "current" chunk gets masked.
|
117 |
+
# see the paper linked in the documentation for details.
|
118 |
+
|
119 |
+
chunk_size = dynchunktrain_config.chunk_size
|
120 |
+
batch_size = x.shape[0]
|
121 |
+
|
122 |
+
# determine the amount of padding we need to insert at the right of
|
123 |
+
# the last chunk so that all chunks end up with the same size.
|
124 |
+
if x.shape[1] % chunk_size != 0:
|
125 |
+
final_right_padding = chunk_size - (x.shape[1] % chunk_size)
|
126 |
+
else:
|
127 |
+
final_right_padding = 0
|
128 |
+
|
129 |
+
# -> [batch_size, t, in_channels]
|
130 |
+
out = self.layer_norm(x)
|
131 |
+
|
132 |
+
# -> [batch_size, in_channels, t] for the CNN
|
133 |
+
out = out.transpose(1, 2)
|
134 |
+
|
135 |
+
# -> [batch_size, in_channels, t] (pointwise)
|
136 |
+
out = self.bottleneck(out)
|
137 |
+
|
138 |
+
# -> [batch_size, in_channels, lc+t+final_right_padding]
|
139 |
+
out = F.pad(out, (self.padding, final_right_padding), value=0)
|
140 |
+
|
141 |
+
# now, make chunks with left context.
|
142 |
+
# as a recap to what the above padding and this unfold do, consider
|
143 |
+
# each a/b/c letter represents a frame as part of chunks a, b, c.
|
144 |
+
# consider a chunk size of 4 and a kernel size of 5 (padding=2):
|
145 |
+
#
|
146 |
+
# input seq: 00aaaabbbbcc00
|
147 |
+
# chunk #1: 00aaaa
|
148 |
+
# chunk #2: aabbbb
|
149 |
+
# chunk #3: bbcc00
|
150 |
+
#
|
151 |
+
# a few remarks here:
|
152 |
+
# - the left padding gets inserted early so that the unfold logic
|
153 |
+
# works trivially
|
154 |
+
# - the right 0-padding got inserted as the number of time steps
|
155 |
+
# could not be evenly split in `chunk_size` chunks
|
156 |
+
|
157 |
+
# -> [batch_size, in_channels, num_chunks, lc+chunk_size]
|
158 |
+
out = out.unfold(2, size=chunk_size + self.padding, step=chunk_size)
|
159 |
+
|
160 |
+
# as we manually disable padding in the convolution below, we insert
|
161 |
+
# right 0-padding to the chunks, e.g. reusing the above example:
|
162 |
+
#
|
163 |
+
# chunk #1: 00aaaa00
|
164 |
+
# chunk #2: aabbbb00
|
165 |
+
# chunk #3: bbcc0000
|
166 |
+
|
167 |
+
# -> [batch_size, in_channels, num_chunks, lc+chunk_size+rpad]
|
168 |
+
out = F.pad(out, (0, self.padding), value=0)
|
169 |
+
|
170 |
+
# the transpose+flatten effectively flattens chunks into the batch
|
171 |
+
# dimension to be processed into the time-wise convolution. the
|
172 |
+
# chunks will later on be unflattened.
|
173 |
+
|
174 |
+
# -> [batch_size, num_chunks, in_channels, lc+chunk_size+rpad]
|
175 |
+
out = out.transpose(1, 2)
|
176 |
+
|
177 |
+
# -> [batch_size * num_chunks, in_channels, lc+chunk_size+rpad]
|
178 |
+
out = out.flatten(start_dim=0, end_dim=1)
|
179 |
+
|
180 |
+
# TODO: experiment around reflect padding, which is difficult
|
181 |
+
# because small chunks have too little time steps to reflect from
|
182 |
+
|
183 |
+
# let's keep backwards compat by pointing at the weights from the
|
184 |
+
# already declared Conv1d.
|
185 |
+
#
|
186 |
+
# still reusing the above example, the convolution will be applied,
|
187 |
+
# with the padding truncated on both ends. the following example
|
188 |
+
# shows the letter corresponding to the input frame on which the
|
189 |
+
# convolution was centered.
|
190 |
+
#
|
191 |
+
# as you can see, the sum of lengths of all chunks is equal to our
|
192 |
+
# input sequence length + `final_right_padding`.
|
193 |
+
#
|
194 |
+
# chunk #1: aaaa
|
195 |
+
# chunk #2: bbbb
|
196 |
+
# chunk #3: cc00
|
197 |
+
|
198 |
+
# -> [batch_size * num_chunks, out_channels, chunk_size]
|
199 |
+
out = F.conv1d(
|
200 |
+
out,
|
201 |
+
weight=self.conv.weight,
|
202 |
+
bias=self.conv.bias,
|
203 |
+
stride=self.conv.stride,
|
204 |
+
padding=0,
|
205 |
+
dilation=self.conv.dilation,
|
206 |
+
groups=self.conv.groups,
|
207 |
+
)
|
208 |
+
|
209 |
+
# -> [batch_size * num_chunks, chunk_size, out_channels]
|
210 |
+
out = out.transpose(1, 2)
|
211 |
+
|
212 |
+
out = self.after_conv(out)
|
213 |
+
|
214 |
+
# -> [batch_size, num_chunks, chunk_size, out_channels]
|
215 |
+
out = torch.unflatten(out, dim=0, sizes=(batch_size, -1))
|
216 |
+
|
217 |
+
# -> [batch_size, t + final_right_padding, out_channels]
|
218 |
+
out = torch.flatten(out, start_dim=1, end_dim=2)
|
219 |
+
|
220 |
+
# -> [batch_size, t, out_channels]
|
221 |
+
if final_right_padding > 0:
|
222 |
+
out = out[:, :-final_right_padding, :]
|
223 |
+
else:
|
224 |
+
out = self.layer_norm(x)
|
225 |
+
out = out.transpose(1, 2)
|
226 |
+
out = self.bottleneck(out)
|
227 |
+
out = self.conv(out)
|
228 |
+
|
229 |
+
if self.causal:
|
230 |
+
# chomp
|
231 |
+
out = out[..., : -self.padding]
|
232 |
+
|
233 |
+
out = out.transpose(1, 2)
|
234 |
+
out = self.after_conv(out)
|
235 |
+
|
236 |
+
if mask is not None:
|
237 |
+
out.masked_fill_(mask, 0.0)
|
238 |
+
|
239 |
+
return out
|
240 |
+
|
241 |
+
|
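# A minimal sketch (illustrative code, not part of the module above) of what the
# chunked unfold in ConvolutionModule.forward produces, assuming chunk_size=4 and
# kernel_size=5 (so the padding is 2), matching the worked example in the comments:
#
#   import torch
#   import torch.nn.functional as F
#   x = torch.arange(1., 11.).view(1, 1, 10)                        # [B=1, C=1, T=10]
#   chunk_size, padding = 4, 2
#   right = (chunk_size - x.shape[-1] % chunk_size) % chunk_size    # -> 2
#   out = F.pad(x, (padding, right))                                # left context + right fill
#   chunks = out.unfold(2, size=chunk_size + padding, step=chunk_size)
#   # chunks.shape == torch.Size([1, 1, 3, 6]): three chunks, each carrying two
#   # frames of left context, i.e. the "00aaaa / aabbbb / bbcc00" layout above.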
242 |
+
class ConmambaEncoderLayer(nn.Module):
|
243 |
+
"""This is an implementation of Conmamba encoder layer.
|
244 |
+
"""
|
245 |
+
|
246 |
+
def __init__(
|
247 |
+
self,
|
248 |
+
d_model,
|
249 |
+
d_ffn,
|
250 |
+
kernel_size=31,
|
251 |
+
activation=Swish,
|
252 |
+
bias=True,
|
253 |
+
dropout=0.0,
|
254 |
+
causal=False,
|
255 |
+
mamba_config=None
|
256 |
+
):
|
257 |
+
super().__init__()
|
258 |
+
assert mamba_config is not None
|
259 |
+
|
260 |
+
bidirectional = mamba_config.pop('bidirectional')
|
261 |
+
if causal or (not bidirectional):
|
262 |
+
self.mamba = Mamba(
|
263 |
+
d_model=d_model,
|
264 |
+
**mamba_config
|
265 |
+
)
|
266 |
+
else:
|
267 |
+
self.mamba = BiMamba(
|
268 |
+
d_model=d_model,
|
269 |
+
bimamba_type='v2',
|
270 |
+
**mamba_config
|
271 |
+
)
|
272 |
+
mamba_config['bidirectional'] = bidirectional
|
273 |
+
|
274 |
+
self.convolution_module = ConvolutionModule(
|
275 |
+
d_model, kernel_size, bias, activation, dropout, causal=causal
|
276 |
+
)
|
277 |
+
|
278 |
+
self.ffn_module1 = nn.Sequential(
|
279 |
+
nn.LayerNorm(d_model),
|
280 |
+
PositionalwiseFeedForward(
|
281 |
+
d_ffn=d_ffn,
|
282 |
+
input_size=d_model,
|
283 |
+
dropout=dropout,
|
284 |
+
activation=activation,
|
285 |
+
),
|
286 |
+
nn.Dropout(dropout),
|
287 |
+
)
|
288 |
+
|
289 |
+
self.ffn_module2 = nn.Sequential(
|
290 |
+
nn.LayerNorm(d_model),
|
291 |
+
PositionalwiseFeedForward(
|
292 |
+
d_ffn=d_ffn,
|
293 |
+
input_size=d_model,
|
294 |
+
dropout=dropout,
|
295 |
+
activation=activation,
|
296 |
+
),
|
297 |
+
nn.Dropout(dropout),
|
298 |
+
)
|
299 |
+
|
300 |
+
self.norm1 = LayerNorm(d_model)
|
301 |
+
self.norm2 = LayerNorm(d_model)
|
302 |
+
self.drop = nn.Dropout(dropout)
|
303 |
+
|
304 |
+
def forward(
|
305 |
+
self,
|
306 |
+
x,
|
307 |
+
src_mask: Optional[torch.Tensor] = None,
|
308 |
+
src_key_padding_mask: Optional[torch.Tensor] = None,
|
309 |
+
pos_embs: torch.Tensor = None,
|
310 |
+
dynchunktrain_config: Optional[DynChunkTrainConfig] = None,
|
311 |
+
):
|
312 |
+
conv_mask: Optional[torch.Tensor] = None
|
313 |
+
if src_key_padding_mask is not None:
|
314 |
+
conv_mask = src_key_padding_mask.unsqueeze(-1)
|
315 |
+
|
316 |
+
conv_mask = None
|
317 |
+
|
318 |
+
# ffn module
|
319 |
+
x = x + 0.5 * self.ffn_module1(x)
|
320 |
+
# mamba module
|
321 |
+
skip = x
|
322 |
+
x = self.norm1(x)
|
323 |
+
x = self.mamba(x)
|
324 |
+
x = x + skip
|
325 |
+
# convolution module
|
326 |
+
x = x + self.convolution_module(
|
327 |
+
x, conv_mask, dynchunktrain_config=dynchunktrain_config
|
328 |
+
)
|
329 |
+
# ffn module
|
330 |
+
x = self.norm2(x + 0.5 * self.ffn_module2(x))
|
331 |
+
return x
|
332 |
+
|
333 |
+
|
334 |
+
class ConmambaEncoder(nn.Module):
|
335 |
+
"""This class implements the Conmamba encoder.
|
336 |
+
"""
|
337 |
+
|
338 |
+
def __init__(
|
339 |
+
self,
|
340 |
+
num_layers,
|
341 |
+
d_model,
|
342 |
+
d_ffn,
|
343 |
+
kernel_size=31,
|
344 |
+
activation=Swish,
|
345 |
+
bias=True,
|
346 |
+
dropout=0.0,
|
347 |
+
causal=False,
|
348 |
+
mamba_config=None
|
349 |
+
):
|
350 |
+
super().__init__()
|
351 |
+
print(f'dropout={dropout} is not used in Mamba.')
|
352 |
+
|
353 |
+
self.layers = torch.nn.ModuleList(
|
354 |
+
[
|
355 |
+
ConmambaEncoderLayer(
|
356 |
+
d_model=d_model,
|
357 |
+
d_ffn=d_ffn,
|
358 |
+
dropout=dropout,
|
359 |
+
activation=activation,
|
360 |
+
kernel_size=kernel_size,
|
361 |
+
bias=bias,
|
362 |
+
causal=causal,
|
363 |
+
mamba_config=mamba_config,
|
364 |
+
)
|
365 |
+
for i in range(num_layers)
|
366 |
+
]
|
367 |
+
)
|
368 |
+
self.norm = LayerNorm(d_model, eps=1e-6)
|
369 |
+
|
370 |
+
def forward(
|
371 |
+
self,
|
372 |
+
src,
|
373 |
+
src_mask: Optional[torch.Tensor] = None,
|
374 |
+
src_key_padding_mask: Optional[torch.Tensor] = None,
|
375 |
+
pos_embs: Optional[torch.Tensor] = None,
|
376 |
+
dynchunktrain_config: Optional[DynChunkTrainConfig] = None,
|
377 |
+
):
|
378 |
+
"""
|
379 |
+
Arguments
|
380 |
+
----------
|
381 |
+
src : torch.Tensor
|
382 |
+
The sequence to the encoder layer.
|
383 |
+
src_mask : torch.Tensor, optional
|
384 |
+
The mask for the src sequence.
|
385 |
+
src_key_padding_mask : torch.Tensor, optional
|
386 |
+
The mask for the src keys per batch.
|
387 |
+
pos_embs: torch.Tensor, torch.nn.Module,
|
388 |
+
Module or tensor containing the input sequence positional embeddings
|
389 |
+
If custom pos_embs are given it needs to have the shape (1, 2*S-1, E)
|
390 |
+
where S is the sequence length, and E is the embedding dimension.
|
391 |
+
dynchunktrain_config: Optional[DynChunkTrainConfig]
|
392 |
+
Dynamic Chunk Training configuration object for streaming,
|
393 |
+
specifically involved here to apply Dynamic Chunk Convolution to the
|
394 |
+
convolution module.
|
395 |
+
"""
|
396 |
+
|
397 |
+
output = src
|
398 |
+
for enc_layer in self.layers:
|
399 |
+
output = enc_layer(
|
400 |
+
output,
|
401 |
+
src_mask=src_mask,
|
402 |
+
src_key_padding_mask=src_key_padding_mask,
|
403 |
+
pos_embs=pos_embs,
|
404 |
+
dynchunktrain_config=dynchunktrain_config,
|
405 |
+
)
|
406 |
+
output = self.norm(output)
|
407 |
+
|
408 |
+
return output, None
|
409 |
+
|
410 |
+
|
411 |
+
class MambaDecoderLayer(nn.Module):
|
412 |
+
"""This class implements the Mamba decoder layer.
|
413 |
+
"""
|
414 |
+
|
415 |
+
def __init__(
|
416 |
+
self,
|
417 |
+
d_model,
|
418 |
+
d_ffn,
|
419 |
+
activation=nn.ReLU,
|
420 |
+
dropout=0.0,
|
421 |
+
normalize_before=False,
|
422 |
+
mamba_config=None
|
423 |
+
):
|
424 |
+
super().__init__()
|
425 |
+
|
426 |
+
assert mamba_config is not None
|
427 |
+
|
428 |
+
bidirectional = mamba_config.pop('bidirectional')
|
429 |
+
|
430 |
+
self.self_mamba = Mamba(
|
431 |
+
d_model=d_model,
|
432 |
+
**mamba_config
|
433 |
+
)
|
434 |
+
|
435 |
+
self.cross_mamba = Mamba(
|
436 |
+
d_model=d_model,
|
437 |
+
**mamba_config
|
438 |
+
)
|
439 |
+
|
440 |
+
mamba_config['bidirectional'] = bidirectional
|
441 |
+
|
442 |
+
self.pos_ffn = sb.nnet.attention.PositionalwiseFeedForward(
|
443 |
+
d_ffn=d_ffn,
|
444 |
+
input_size=d_model,
|
445 |
+
dropout=dropout,
|
446 |
+
activation=activation,
|
447 |
+
)
|
448 |
+
|
449 |
+
# normalization layers
|
450 |
+
self.norm1 = sb.nnet.normalization.LayerNorm(d_model, eps=1e-6)
|
451 |
+
self.norm2 = sb.nnet.normalization.LayerNorm(d_model, eps=1e-6)
|
452 |
+
self.norm3 = sb.nnet.normalization.LayerNorm(d_model, eps=1e-6)
|
453 |
+
self.dropout1 = torch.nn.Dropout(dropout)
|
454 |
+
self.dropout2 = torch.nn.Dropout(dropout)
|
455 |
+
self.dropout3 = torch.nn.Dropout(dropout)
|
456 |
+
|
457 |
+
self.normalize_before = normalize_before
|
458 |
+
|
459 |
+
def forward(
|
460 |
+
self,
|
461 |
+
tgt,
|
462 |
+
memory,
|
463 |
+
tgt_mask=None,
|
464 |
+
memory_mask=None,
|
465 |
+
tgt_key_padding_mask=None,
|
466 |
+
memory_key_padding_mask=None,
|
467 |
+
pos_embs_tgt=None,
|
468 |
+
pos_embs_src=None,
|
469 |
+
):
|
470 |
+
"""
|
471 |
+
Arguments
|
472 |
+
----------
|
473 |
+
tgt: torch.Tensor
|
474 |
+
The sequence to the decoder layer (required).
|
475 |
+
memory: torch.Tensor
|
476 |
+
The sequence from the last layer of the encoder (required).
|
477 |
+
tgt_mask: torch.Tensor
|
478 |
+
The mask for the tgt sequence (optional).
|
479 |
+
memory_mask: torch.Tensor
|
480 |
+
The mask for the memory sequence (optional).
|
481 |
+
tgt_key_padding_mask: torch.Tensor
|
482 |
+
The mask for the tgt keys per batch (optional).
|
483 |
+
memory_key_padding_mask: torch.Tensor
|
484 |
+
The mask for the memory keys per batch (optional).
|
485 |
+
pos_embs_tgt: torch.Tensor
|
486 |
+
The positional embeddings for the target (optional).
|
487 |
+
pos_embs_src: torch.Tensor
|
488 |
+
The positional embeddings for the source (optional).
|
489 |
+
"""
|
490 |
+
if self.normalize_before:
|
491 |
+
tgt1 = self.norm1(tgt)
|
492 |
+
else:
|
493 |
+
tgt1 = tgt
|
494 |
+
|
495 |
+
# Mamba over the target sequence
|
496 |
+
tgt2 = self.self_mamba(tgt1)
|
497 |
+
|
498 |
+
# add & norm
|
499 |
+
tgt = tgt + self.dropout1(tgt2)
|
500 |
+
if not self.normalize_before:
|
501 |
+
tgt = self.norm1(tgt)
|
502 |
+
|
503 |
+
if self.normalize_before:
|
504 |
+
tgt1 = self.norm2(tgt)
|
505 |
+
else:
|
506 |
+
tgt1 = tgt
|
507 |
+
|
508 |
+
# Mamba over key=value + query
|
509 |
+
# and only take the last len(query) tokens
|
510 |
+
tgt2 = self.cross_mamba(torch.cat([memory, tgt1], dim=1))[:, -tgt1.shape[1]:]
|
511 |
+
|
512 |
+
# add & norm
|
513 |
+
tgt = tgt + self.dropout2(tgt2)
|
514 |
+
if not self.normalize_before:
|
515 |
+
tgt = self.norm2(tgt)
|
516 |
+
|
517 |
+
if self.normalize_before:
|
518 |
+
tgt1 = self.norm3(tgt)
|
519 |
+
else:
|
520 |
+
tgt1 = tgt
|
521 |
+
|
522 |
+
tgt2 = self.pos_ffn(tgt1)
|
523 |
+
|
524 |
+
# add & norm
|
525 |
+
tgt = tgt + self.dropout3(tgt2)
|
526 |
+
if not self.normalize_before:
|
527 |
+
tgt = self.norm3(tgt)
|
528 |
+
|
529 |
+
return tgt, None, None
|
530 |
+
|
531 |
+
|
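# A shape-only sketch (stand-in code, not the module itself) of the cross-Mamba step
# in MambaDecoderLayer.forward above: since there is no explicit cross-attention, the
# encoder memory and the target are concatenated along time and only the last
# len(query) frames of the output are kept.
#
#   import torch
#   B, T_mem, T_tgt, D = 2, 50, 7, 256
#   memory = torch.randn(B, T_mem, D)
#   tgt1 = torch.randn(B, T_tgt, D)
#   fused = torch.cat([memory, tgt1], dim=1)    # [B, T_mem + T_tgt, D]
#   out = fused                                 # stand-in for self.cross_mamba(fused)
#   tgt2 = out[:, -tgt1.shape[1]:]              # keep the last T_tgt frames -> [B, 7, D]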
532 |
+
class MambaDecoder(nn.Module):
|
533 |
+
"""This class implements the Mamba decoder.
|
534 |
+
"""
|
535 |
+
|
536 |
+
def __init__(
|
537 |
+
self,
|
538 |
+
num_layers,
|
539 |
+
d_model,
|
540 |
+
d_ffn,
|
541 |
+
activation=nn.ReLU,
|
542 |
+
dropout=0.0,
|
543 |
+
normalize_before=False,
|
544 |
+
mamba_config=None
|
545 |
+
):
|
546 |
+
super().__init__()
|
547 |
+
self.layers = torch.nn.ModuleList(
|
548 |
+
[
|
549 |
+
MambaDecoderLayer(
|
550 |
+
d_model=d_model,
|
551 |
+
d_ffn=d_ffn,
|
552 |
+
activation=activation,
|
553 |
+
dropout=dropout,
|
554 |
+
normalize_before=normalize_before,
|
555 |
+
mamba_config=mamba_config
|
556 |
+
)
|
557 |
+
for _ in range(num_layers)
|
558 |
+
]
|
559 |
+
)
|
560 |
+
self.norm = sb.nnet.normalization.LayerNorm(d_model, eps=1e-6)
|
561 |
+
|
562 |
+
def forward(
|
563 |
+
self,
|
564 |
+
tgt,
|
565 |
+
memory,
|
566 |
+
tgt_mask=None,
|
567 |
+
memory_mask=None,
|
568 |
+
tgt_key_padding_mask=None,
|
569 |
+
memory_key_padding_mask=None,
|
570 |
+
pos_embs_tgt=None,
|
571 |
+
pos_embs_src=None,
|
572 |
+
):
|
573 |
+
"""
|
574 |
+
Arguments
|
575 |
+
----------
|
576 |
+
tgt : torch.Tensor
|
577 |
+
The sequence to the decoder layer (required).
|
578 |
+
memory : torch.Tensor
|
579 |
+
The sequence from the last layer of the encoder (required).
|
580 |
+
tgt_mask : torch.Tensor
|
581 |
+
The mask for the tgt sequence (optional).
|
582 |
+
memory_mask : torch.Tensor
|
583 |
+
The mask for the memory sequence (optional).
|
584 |
+
tgt_key_padding_mask : torch.Tensor
|
585 |
+
The mask for the tgt keys per batch (optional).
|
586 |
+
memory_key_padding_mask : torch.Tensor
|
587 |
+
The mask for the memory keys per batch (optional).
|
588 |
+
pos_embs_tgt : torch.Tensor
|
589 |
+
The positional embeddings for the target (optional).
|
590 |
+
pos_embs_src : torch.Tensor
|
591 |
+
The positional embeddings for the source (optional).
|
592 |
+
"""
|
593 |
+
output = tgt
|
594 |
+
for dec_layer in self.layers:
|
595 |
+
output, _, _ = dec_layer(
|
596 |
+
output,
|
597 |
+
memory,
|
598 |
+
tgt_mask=tgt_mask,
|
599 |
+
memory_mask=memory_mask,
|
600 |
+
tgt_key_padding_mask=tgt_key_padding_mask,
|
601 |
+
memory_key_padding_mask=memory_key_padding_mask,
|
602 |
+
pos_embs_tgt=pos_embs_tgt,
|
603 |
+
pos_embs_src=pos_embs_src,
|
604 |
+
)
|
605 |
+
output = self.norm(output)
|
606 |
+
|
607 |
+
return output, [None], [None]
|
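A minimal usage sketch for the Conmamba modules listed above, assuming the mamba-ssm
package is installed, that its Mamba block accepts d_state, d_conv, and expand as
keyword arguments, and that the bundled bidirectional variant builds on the target
machine (the 'bidirectional' key itself is consumed by the encoder layer):

    import torch
    from modules.Conmamba import ConmambaEncoder

    mamba_config = {'bidirectional': True, 'd_state': 16, 'd_conv': 4, 'expand': 2}
    encoder = ConmambaEncoder(num_layers=2, d_model=144, d_ffn=576,
                              kernel_size=31, mamba_config=mamba_config)
    x = torch.randn(4, 120, 144)   # [batch, time, d_model]
    out, _ = encoder(x)            # the encoder returns (output, None)
    print(out.shape)               # torch.Size([4, 120, 144])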
model/modules/Transformer.py
ADDED
@@ -0,0 +1,1085 @@
1 |
+
"""Added ConMamba and Mamba
|
2 |
+
|
3 |
+
Authors
|
4 |
+
* Xilin Jiang 2024
|
5 |
+
"""
|
6 |
+
|
7 |
+
"""Transformer implementation in the SpeechBrain style.
|
8 |
+
|
9 |
+
Authors
|
10 |
+
* Jianyuan Zhong 2020
|
11 |
+
* Samuele Cornell 2021
|
12 |
+
"""
|
13 |
+
|
14 |
+
import math
|
15 |
+
from typing import Optional
|
16 |
+
|
17 |
+
import numpy as np
|
18 |
+
import torch
|
19 |
+
import torch.nn as nn
|
20 |
+
|
21 |
+
import speechbrain as sb
|
22 |
+
from speechbrain.nnet.activations import Swish
|
23 |
+
from speechbrain.nnet.attention import RelPosEncXL
|
24 |
+
from speechbrain.nnet.CNN import Conv1d
|
25 |
+
|
26 |
+
from modules.Conformer import ConformerEncoder
|
27 |
+
from modules.Conmamba import ConmambaEncoder, MambaDecoder
|
28 |
+
|
29 |
+
|
30 |
+
class TransformerInterface(nn.Module):
|
31 |
+
"""This is an interface for transformer model.
|
32 |
+
Users can modify the attributes and define the forward function as
|
33 |
+
needed according to their own tasks.
|
34 |
+
The architecture is based on the paper "Attention Is All You Need":
|
35 |
+
https://arxiv.org/pdf/1706.03762.pdf
|
36 |
+
|
37 |
+
Arguments
|
38 |
+
---------
|
39 |
+
d_model: int
|
40 |
+
The number of expected features in the encoder/decoder inputs (default=512).
|
41 |
+
nhead: int
|
42 |
+
The number of heads in the multi-head attention models (default=8).
|
43 |
+
num_encoder_layers: int, optional
|
44 |
+
The number of encoder layers in the encoder.
|
45 |
+
num_decoder_layers: int, optional
|
46 |
+
The number of decoder layers in the decoder.
|
47 |
+
d_ffn: int, optional
|
48 |
+
The dimension of the feedforward network model hidden layer.
|
49 |
+
dropout: int, optional
|
50 |
+
The dropout value.
|
51 |
+
activation: torch.nn.Module, optional
|
52 |
+
The activation function for Feed-Forward Network layer,
|
53 |
+
e.g., relu or gelu or swish.
|
54 |
+
custom_src_module: torch.nn.Module, optional
|
55 |
+
Module that processes the src features to expected feature dim.
|
56 |
+
custom_tgt_module: torch.nn.Module, optional
|
57 |
+
Module that processes the tgt features to expected feature dim.
|
58 |
+
positional_encoding: str, optional
|
59 |
+
Type of positional encoding used. e.g. 'fixed_abs_sine' for fixed absolute positional encodings.
|
60 |
+
normalize_before: bool, optional
|
61 |
+
Whether normalization should be applied before or after MHA or FFN in Transformer layers.
|
62 |
+
Defaults to True as this was shown to lead to better performance and training stability.
|
63 |
+
kernel_size: int, optional
|
64 |
+
Kernel size in convolutional layers when Conformer is used.
|
65 |
+
bias: bool, optional
|
66 |
+
Whether to use bias in Conformer convolutional layers.
|
67 |
+
encoder_module: str, optional
|
68 |
+
Choose between Branchformer, Conformer, ConMamba, and Transformer for the encoder.
|
69 |
+
decoder_module: str, optional
|
70 |
+
Choose between Mamba and Transformer for the decoder.
|
71 |
+
conformer_activation: torch.nn.Module, optional
|
72 |
+
Activation module used after Conformer convolutional layers, e.g. Swish or ReLU. It has to be a torch Module.
|
73 |
+
branchformer_activation: torch.nn.Module, optional
|
74 |
+
Activation module used within the Branchformer Encoder, e.g. Swish or ReLU. It has to be a torch Module.
|
75 |
+
attention_type: str, optional
|
76 |
+
Type of attention layer used in all Transformer or Conformer layers.
|
77 |
+
e.g. regularMHA or RelPosMHA.
|
78 |
+
max_length: int, optional
|
79 |
+
Max length for the target and source sequence in input.
|
80 |
+
Used for positional encodings.
|
81 |
+
causal: bool, optional
|
82 |
+
Whether the encoder should be causal or not (the decoder is always causal).
|
83 |
+
If causal the Conformer convolutional layer is causal.
|
84 |
+
encoder_kdim: int, optional
|
85 |
+
Dimension of the key for the encoder.
|
86 |
+
encoder_vdim: int, optional
|
87 |
+
Dimension of the value for the encoder.
|
88 |
+
decoder_kdim: int, optional
|
89 |
+
Dimension of the key for the decoder.
|
90 |
+
decoder_vdim: int, optional
|
91 |
+
Dimension of the value for the decoder.
|
92 |
+
csgu_linear_units: int, optional
|
93 |
+
Number of neurons in the hidden linear units of the CSGU Module.
|
94 |
+
-> Branchformer
|
95 |
+
gate_activation: torch.nn.Module, optional
|
96 |
+
Activation function used at the gate of the CSGU module.
|
97 |
+
-> Branchformer
|
98 |
+
use_linear_after_conv: bool, optional
|
99 |
+
If True, will apply a linear transformation of size input_size//2.
|
100 |
+
-> Branchformer
|
101 |
+
mamba_config: dict, optional
|
102 |
+
Mamba parameters if encoder_module or decoder_module is Mamba or ConMamba
|
103 |
+
"""
|
104 |
+
|
105 |
+
def __init__(
|
106 |
+
self,
|
107 |
+
d_model=512,
|
108 |
+
nhead=8,
|
109 |
+
num_encoder_layers=6,
|
110 |
+
num_decoder_layers=6,
|
111 |
+
d_ffn=2048,
|
112 |
+
dropout=0.1,
|
113 |
+
activation=nn.ReLU,
|
114 |
+
custom_src_module=None,
|
115 |
+
custom_tgt_module=None,
|
116 |
+
positional_encoding="fixed_abs_sine",
|
117 |
+
normalize_before=True,
|
118 |
+
kernel_size: Optional[int] = 31,
|
119 |
+
bias: Optional[bool] = True,
|
120 |
+
encoder_module: Optional[str] = "transformer",
|
121 |
+
decoder_module: Optional[str] = "transformer",
|
122 |
+
conformer_activation: Optional[nn.Module] = Swish,
|
123 |
+
branchformer_activation: Optional[nn.Module] = nn.GELU,
|
124 |
+
attention_type: Optional[str] = "regularMHA",
|
125 |
+
max_length: Optional[int] = 2500,
|
126 |
+
causal: Optional[bool] = False,
|
127 |
+
encoder_kdim: Optional[int] = None,
|
128 |
+
encoder_vdim: Optional[int] = None,
|
129 |
+
decoder_kdim: Optional[int] = None,
|
130 |
+
decoder_vdim: Optional[int] = None,
|
131 |
+
csgu_linear_units: Optional[int] = 3072,
|
132 |
+
gate_activation: Optional[nn.Module] = nn.Identity,
|
133 |
+
use_linear_after_conv: Optional[bool] = False,
|
134 |
+
mamba_config=None
|
135 |
+
):
|
136 |
+
super().__init__()
|
137 |
+
self.causal = causal
|
138 |
+
self.attention_type = attention_type
|
139 |
+
self.positional_encoding_type = positional_encoding
|
140 |
+
self.encoder_kdim = encoder_kdim
|
141 |
+
self.encoder_vdim = encoder_vdim
|
142 |
+
self.decoder_kdim = decoder_kdim
|
143 |
+
self.decoder_vdim = decoder_vdim
|
144 |
+
|
145 |
+
assert attention_type in ["regularMHA", "RelPosMHAXL", "hypermixing"]
|
146 |
+
assert positional_encoding in ["fixed_abs_sine", None]
|
147 |
+
|
148 |
+
assert (
|
149 |
+
num_encoder_layers + num_decoder_layers > 0
|
150 |
+
), "number of encoder layers and number of decoder layers cannot both be 0!"
|
151 |
+
|
152 |
+
if positional_encoding == "fixed_abs_sine":
|
153 |
+
self.positional_encoding = PositionalEncoding(d_model, max_length)
|
154 |
+
elif positional_encoding is None:
|
155 |
+
pass
|
156 |
+
# no positional encodings
|
157 |
+
|
158 |
+
# overrides any other pos_embedding
|
159 |
+
if attention_type == "RelPosMHAXL":
|
160 |
+
self.positional_encoding = RelPosEncXL(d_model)
|
161 |
+
self.positional_encoding_decoder = PositionalEncoding(
|
162 |
+
d_model, max_length
|
163 |
+
)
|
164 |
+
|
165 |
+
# initialize the encoder
|
166 |
+
if num_encoder_layers > 0:
|
167 |
+
if custom_src_module is not None:
|
168 |
+
self.custom_src_module = custom_src_module(d_model)
|
169 |
+
if encoder_module == "transformer":
|
170 |
+
self.encoder = TransformerEncoder(
|
171 |
+
nhead=nhead,
|
172 |
+
num_layers=num_encoder_layers,
|
173 |
+
d_ffn=d_ffn,
|
174 |
+
d_model=d_model,
|
175 |
+
dropout=dropout,
|
176 |
+
activation=activation,
|
177 |
+
normalize_before=normalize_before,
|
178 |
+
causal=self.causal,
|
179 |
+
attention_type=self.attention_type,
|
180 |
+
kdim=self.encoder_kdim,
|
181 |
+
vdim=self.encoder_vdim,
|
182 |
+
)
|
183 |
+
elif encoder_module == "conformer":
|
184 |
+
self.encoder = ConformerEncoder(
|
185 |
+
nhead=nhead,
|
186 |
+
num_layers=num_encoder_layers,
|
187 |
+
d_ffn=d_ffn,
|
188 |
+
d_model=d_model,
|
189 |
+
dropout=dropout,
|
190 |
+
activation=conformer_activation,
|
191 |
+
kernel_size=kernel_size,
|
192 |
+
bias=bias,
|
193 |
+
causal=self.causal,
|
194 |
+
attention_type=self.attention_type,
|
195 |
+
)
|
196 |
+
assert (
|
197 |
+
normalize_before
|
198 |
+
), "normalize_before must be True for Conformer"
|
199 |
+
|
200 |
+
assert (
|
201 |
+
conformer_activation is not None
|
202 |
+
), "conformer_activation must not be None"
|
203 |
+
elif encoder_module == "branchformer":
|
204 |
+
self.encoder = BranchformerEncoder(
|
205 |
+
nhead=nhead,
|
206 |
+
num_layers=num_encoder_layers,
|
207 |
+
d_model=d_model,
|
208 |
+
dropout=dropout,
|
209 |
+
activation=branchformer_activation,
|
210 |
+
kernel_size=kernel_size,
|
211 |
+
attention_type=self.attention_type,
|
212 |
+
csgu_linear_units=csgu_linear_units,
|
213 |
+
gate_activation=gate_activation,
|
214 |
+
use_linear_after_conv=use_linear_after_conv,
|
215 |
+
)
|
216 |
+
elif encoder_module == "conmamba":
|
217 |
+
self.encoder = ConmambaEncoder(
|
218 |
+
num_layers=num_encoder_layers,
|
219 |
+
d_model=d_model,
|
220 |
+
d_ffn=d_ffn,
|
221 |
+
dropout=dropout,
|
222 |
+
activation=branchformer_activation,
|
223 |
+
kernel_size=kernel_size,
|
224 |
+
bias=bias,
|
225 |
+
causal=self.causal,
|
226 |
+
mamba_config=mamba_config
|
227 |
+
)
|
228 |
+
assert (
|
229 |
+
normalize_before
|
230 |
+
), "normalize_before must be True for Conmamba"
|
231 |
+
|
232 |
+
assert (
|
233 |
+
conformer_activation is not None
|
234 |
+
), "conformer_activation must not be None"
|
235 |
+
|
236 |
+
# initialize the decoder
|
237 |
+
if num_decoder_layers > 0:
|
238 |
+
if custom_tgt_module is not None:
|
239 |
+
self.custom_tgt_module = custom_tgt_module(d_model)
|
240 |
+
if decoder_module == 'transformer':
|
241 |
+
self.decoder = TransformerDecoder(
|
242 |
+
num_layers=num_decoder_layers,
|
243 |
+
nhead=nhead,
|
244 |
+
d_ffn=d_ffn,
|
245 |
+
d_model=d_model,
|
246 |
+
dropout=dropout,
|
247 |
+
activation=activation,
|
248 |
+
normalize_before=normalize_before,
|
249 |
+
causal=True,
|
250 |
+
attention_type="regularMHA", # always use regular attention in decoder
|
251 |
+
kdim=self.decoder_kdim,
|
252 |
+
vdim=self.decoder_vdim,
|
253 |
+
)
|
254 |
+
elif decoder_module in ['mamba']:
|
255 |
+
self.decoder = MambaDecoder(
|
256 |
+
num_layers=num_decoder_layers,
|
257 |
+
d_ffn=d_ffn,
|
258 |
+
d_model=d_model,
|
259 |
+
activation=activation,
|
260 |
+
dropout=dropout,
|
261 |
+
normalize_before=normalize_before,
|
262 |
+
mamba_config=mamba_config
|
263 |
+
)
|
264 |
+
else:
|
265 |
+
raise NotImplementedError(decoder_module)
|
266 |
+
|
267 |
+
def forward(self, **kwargs):
|
268 |
+
"""Users should modify this function according to their own tasks."""
|
269 |
+
raise NotImplementedError
|
270 |
+
|
271 |
+
|
272 |
+
class PositionalEncoding(nn.Module):
|
273 |
+
"""This class implements the absolute sinusoidal positional encoding function.
|
274 |
+
PE(pos, 2i) = sin(pos/(10000^(2i/dmodel)))
|
275 |
+
PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
|
276 |
+
|
277 |
+
Arguments
|
278 |
+
---------
|
279 |
+
input_size: int
|
280 |
+
Embedding dimension.
|
281 |
+
max_len : int, optional
|
282 |
+
Max length of the input sequences (default 2500).
|
283 |
+
|
284 |
+
Example
|
285 |
+
-------
|
286 |
+
>>> a = torch.rand((8, 120, 512))
|
287 |
+
>>> enc = PositionalEncoding(input_size=a.shape[-1])
|
288 |
+
>>> b = enc(a)
|
289 |
+
>>> b.shape
|
290 |
+
torch.Size([1, 120, 512])
|
291 |
+
"""
|
292 |
+
|
293 |
+
def __init__(self, input_size, max_len=2500):
|
294 |
+
super().__init__()
|
295 |
+
if input_size % 2 != 0:
|
296 |
+
raise ValueError(
|
297 |
+
f"Cannot use sin/cos positional encoding with odd channels (got channels={input_size})"
|
298 |
+
)
|
299 |
+
self.max_len = max_len
|
300 |
+
pe = torch.zeros(self.max_len, input_size, requires_grad=False)
|
301 |
+
positions = torch.arange(0, self.max_len).unsqueeze(1).float()
|
302 |
+
denominator = torch.exp(
|
303 |
+
torch.arange(0, input_size, 2).float()
|
304 |
+
* -(math.log(10000.0) / input_size)
|
305 |
+
)
|
306 |
+
|
307 |
+
pe[:, 0::2] = torch.sin(positions * denominator)
|
308 |
+
pe[:, 1::2] = torch.cos(positions * denominator)
|
309 |
+
pe = pe.unsqueeze(0)
|
310 |
+
self.register_buffer("pe", pe)
|
311 |
+
|
312 |
+
def forward(self, x):
|
313 |
+
"""
|
314 |
+
Arguments
|
315 |
+
---------
|
316 |
+
x : torch.Tensor
|
317 |
+
Input feature shape (batch, time, fea)
|
318 |
+
|
319 |
+
Returns
|
320 |
+
-------
|
321 |
+
The positional encoding.
|
322 |
+
"""
|
323 |
+
return self.pe[:, : x.size(1)].clone().detach()
|
324 |
+
|
325 |
+
|
326 |
+
class TransformerEncoderLayer(nn.Module):
|
327 |
+
"""This is an implementation of self-attention encoder layer.
|
328 |
+
|
329 |
+
Arguments
|
330 |
+
---------
|
331 |
+
d_ffn: int, optional
|
332 |
+
The dimension of the feedforward network model hidden layer.
|
333 |
+
nhead: int
|
334 |
+
The number of heads in the multi-head attention models (default=8).
|
335 |
+
d_model: int
|
336 |
+
The number of expected features in the encoder/decoder inputs (default=512).
|
337 |
+
kdim: int, optional
|
338 |
+
Dimension of the key.
|
339 |
+
vdim: int, optional
|
340 |
+
Dimension of the value.
|
341 |
+
dropout: int, optional
|
342 |
+
The dropout value.
|
343 |
+
activation: torch.nn.Module, optional
|
344 |
+
The activation function for Feed-Forward Network layer,
|
345 |
+
e.g., relu or gelu or swish.
|
346 |
+
normalize_before: bool, optional
|
347 |
+
Whether normalization should be applied before or after MHA or FFN in Transformer layers.
|
348 |
+
Defaults to True as this was shown to lead to better performance and training stability.
|
349 |
+
attention_type: str, optional
|
350 |
+
Type of attention layer used in all Transformer or Conformer layers.
|
351 |
+
e.g. regularMHA or RelPosMHA.
|
352 |
+
ffn_type: str
|
353 |
+
type of ffn: regularFFN/1dcnn
|
354 |
+
ffn_cnn_kernel_size_list: list of int
|
355 |
+
kernel size of 2 1d-convs if ffn_type is 1dcnn
|
356 |
+
causal: bool, optional
|
357 |
+
Whether the encoder should be causal or not (the decoder is always causal).
|
358 |
+
If causal the Conformer convolutional layer is causal.
|
359 |
+
|
360 |
+
Example
|
361 |
+
-------
|
362 |
+
>>> import torch
|
363 |
+
>>> x = torch.rand((8, 60, 512))
|
364 |
+
>>> net = TransformerEncoderLayer(512, 8, d_model=512)
|
365 |
+
>>> output = net(x)
|
366 |
+
>>> output[0].shape
|
367 |
+
torch.Size([8, 60, 512])
|
368 |
+
"""
|
369 |
+
|
370 |
+
def __init__(
|
371 |
+
self,
|
372 |
+
d_ffn,
|
373 |
+
nhead,
|
374 |
+
d_model,
|
375 |
+
kdim=None,
|
376 |
+
vdim=None,
|
377 |
+
dropout=0.0,
|
378 |
+
activation=nn.ReLU,
|
379 |
+
normalize_before=False,
|
380 |
+
attention_type="regularMHA",
|
381 |
+
ffn_type="regularFFN",
|
382 |
+
ffn_cnn_kernel_size_list=[3, 3],
|
383 |
+
causal=False,
|
384 |
+
):
|
385 |
+
super().__init__()
|
386 |
+
|
387 |
+
if attention_type == "regularMHA":
|
388 |
+
self.self_att = sb.nnet.attention.MultiheadAttention(
|
389 |
+
nhead=nhead,
|
390 |
+
d_model=d_model,
|
391 |
+
dropout=dropout,
|
392 |
+
kdim=kdim,
|
393 |
+
vdim=vdim,
|
394 |
+
)
|
395 |
+
|
396 |
+
elif attention_type == "RelPosMHAXL":
|
397 |
+
self.self_att = sb.nnet.attention.RelPosMHAXL(
|
398 |
+
d_model, nhead, dropout, mask_pos_future=causal
|
399 |
+
)
|
400 |
+
elif attention_type == "hypermixing":
|
401 |
+
self.self_att = sb.nnet.hypermixing.HyperMixing(
|
402 |
+
input_output_dim=d_model,
|
403 |
+
hypernet_size=d_ffn,
|
404 |
+
tied=False,
|
405 |
+
num_heads=nhead,
|
406 |
+
fix_tm_hidden_size=False,
|
407 |
+
)
|
408 |
+
|
409 |
+
if ffn_type == "regularFFN":
|
410 |
+
self.pos_ffn = sb.nnet.attention.PositionalwiseFeedForward(
|
411 |
+
d_ffn=d_ffn,
|
412 |
+
input_size=d_model,
|
413 |
+
dropout=dropout,
|
414 |
+
activation=activation,
|
415 |
+
)
|
416 |
+
elif ffn_type == "1dcnn":
|
417 |
+
self.pos_ffn = nn.Sequential(
|
418 |
+
Conv1d(
|
419 |
+
in_channels=d_model,
|
420 |
+
out_channels=d_ffn,
|
421 |
+
kernel_size=ffn_cnn_kernel_size_list[0],
|
422 |
+
padding="causal" if causal else "same",
|
423 |
+
),
|
424 |
+
nn.ReLU(),
|
425 |
+
Conv1d(
|
426 |
+
in_channels=d_ffn,
|
427 |
+
out_channels=d_model,
|
428 |
+
kernel_size=ffn_cnn_kernel_size_list[1],
|
429 |
+
padding="causal" if causal else "same",
|
430 |
+
),
|
431 |
+
)
|
432 |
+
|
433 |
+
self.norm1 = sb.nnet.normalization.LayerNorm(d_model, eps=1e-6)
|
434 |
+
self.norm2 = sb.nnet.normalization.LayerNorm(d_model, eps=1e-6)
|
435 |
+
self.dropout1 = torch.nn.Dropout(dropout)
|
436 |
+
self.dropout2 = torch.nn.Dropout(dropout)
|
437 |
+
|
438 |
+
self.normalize_before = normalize_before
|
439 |
+
self.pos_ffn_type = ffn_type
|
440 |
+
|
441 |
+
def forward(
|
442 |
+
self,
|
443 |
+
src,
|
444 |
+
src_mask: Optional[torch.Tensor] = None,
|
445 |
+
src_key_padding_mask: Optional[torch.Tensor] = None,
|
446 |
+
pos_embs: Optional[torch.Tensor] = None,
|
447 |
+
):
|
448 |
+
"""
|
449 |
+
Arguments
|
450 |
+
---------
|
451 |
+
src : torch.Tensor
|
452 |
+
The sequence to the encoder layer.
|
453 |
+
src_mask : torch.Tensor
|
454 |
+
The mask for the src query for each example in the batch.
|
455 |
+
src_key_padding_mask : torch.Tensor, optional
|
456 |
+
The mask for the src keys for each example in the batch.
|
457 |
+
pos_embs: torch.Tensor, optional
|
458 |
+
The positional embeddings tensor.
|
459 |
+
|
460 |
+
Returns
|
461 |
+
-------
|
462 |
+
output : torch.Tensor
|
463 |
+
The output of the transformer encoder layer.
|
464 |
+
"""
|
465 |
+
|
466 |
+
if self.normalize_before:
|
467 |
+
src1 = self.norm1(src)
|
468 |
+
else:
|
469 |
+
src1 = src
|
470 |
+
|
471 |
+
output, self_attn = self.self_att(
|
472 |
+
src1,
|
473 |
+
src1,
|
474 |
+
src1,
|
475 |
+
attn_mask=src_mask,
|
476 |
+
key_padding_mask=src_key_padding_mask,
|
477 |
+
pos_embs=pos_embs,
|
478 |
+
)
|
479 |
+
|
480 |
+
# add & norm
|
481 |
+
src = src + self.dropout1(output)
|
482 |
+
if not self.normalize_before:
|
483 |
+
src = self.norm1(src)
|
484 |
+
|
485 |
+
if self.normalize_before:
|
486 |
+
src1 = self.norm2(src)
|
487 |
+
else:
|
488 |
+
src1 = src
|
489 |
+
output = self.pos_ffn(src1)
|
490 |
+
|
491 |
+
# add & norm
|
492 |
+
output = src + self.dropout2(output)
|
493 |
+
if not self.normalize_before:
|
494 |
+
output = self.norm2(output)
|
495 |
+
return output, self_attn
|
496 |
+
|
497 |
+
|
498 |
+
class TransformerEncoder(nn.Module):
|
499 |
+
"""This class implements the transformer encoder.
|
500 |
+
|
501 |
+
Arguments
|
502 |
+
---------
|
503 |
+
num_layers : int
|
504 |
+
Number of transformer layers to include.
|
505 |
+
nhead : int
|
506 |
+
Number of attention heads.
|
507 |
+
d_ffn : int
|
508 |
+
Hidden size of self-attention Feed Forward layer.
|
509 |
+
input_shape : tuple
|
510 |
+
Expected shape of the input.
|
511 |
+
d_model : int
|
512 |
+
The dimension of the input embedding.
|
513 |
+
kdim : int
|
514 |
+
Dimension for key (Optional).
|
515 |
+
vdim : int
|
516 |
+
Dimension for value (Optional).
|
517 |
+
dropout : float
|
518 |
+
Dropout for the encoder (Optional).
|
519 |
+
activation: torch.nn.Module, optional
|
520 |
+
The activation function for Feed-Forward Network layer,
|
521 |
+
e.g., relu or gelu or swish.
|
522 |
+
normalize_before: bool, optional
|
523 |
+
Whether normalization should be applied before or after MHA or FFN in Transformer layers.
|
524 |
+
Defaults to True as this was shown to lead to better performance and training stability.
|
525 |
+
causal: bool, optional
|
526 |
+
Whether the encoder should be causal or not (the decoder is always causal).
|
527 |
+
If causal the Conformer convolutional layer is causal.
|
528 |
+
layerdrop_prob: float
|
529 |
+
The probability to drop an entire layer
|
530 |
+
attention_type: str, optional
|
531 |
+
Type of attention layer used in all Transformer or Conformer layers.
|
532 |
+
e.g. regularMHA or RelPosMHA.
|
533 |
+
ffn_type: str
|
534 |
+
type of ffn: regularFFN/1dcnn
|
535 |
+
ffn_cnn_kernel_size_list: list of int
|
536 |
+
conv kernel size of 2 1d-convs if ffn_type is 1dcnn
|
537 |
+
|
538 |
+
Example
|
539 |
+
-------
|
540 |
+
>>> import torch
|
541 |
+
>>> x = torch.rand((8, 60, 512))
|
542 |
+
>>> net = TransformerEncoder(1, 8, 512, d_model=512)
|
543 |
+
>>> output, _ = net(x)
|
544 |
+
>>> output.shape
|
545 |
+
torch.Size([8, 60, 512])
|
546 |
+
"""
|
547 |
+
|
548 |
+
def __init__(
|
549 |
+
self,
|
550 |
+
num_layers,
|
551 |
+
nhead,
|
552 |
+
d_ffn,
|
553 |
+
input_shape=None,
|
554 |
+
d_model=None,
|
555 |
+
kdim=None,
|
556 |
+
vdim=None,
|
557 |
+
dropout=0.0,
|
558 |
+
activation=nn.ReLU,
|
559 |
+
normalize_before=False,
|
560 |
+
causal=False,
|
561 |
+
layerdrop_prob=0.0,
|
562 |
+
attention_type="regularMHA",
|
563 |
+
ffn_type="regularFFN",
|
564 |
+
ffn_cnn_kernel_size_list=[3, 3],
|
565 |
+
):
|
566 |
+
super().__init__()
|
567 |
+
|
568 |
+
self.layers = torch.nn.ModuleList(
|
569 |
+
[
|
570 |
+
TransformerEncoderLayer(
|
571 |
+
d_ffn=d_ffn,
|
572 |
+
nhead=nhead,
|
573 |
+
d_model=d_model,
|
574 |
+
kdim=kdim,
|
575 |
+
vdim=vdim,
|
576 |
+
dropout=dropout,
|
577 |
+
activation=activation,
|
578 |
+
normalize_before=normalize_before,
|
579 |
+
causal=causal,
|
580 |
+
attention_type=attention_type,
|
581 |
+
ffn_type=ffn_type,
|
582 |
+
ffn_cnn_kernel_size_list=ffn_cnn_kernel_size_list,
|
583 |
+
)
|
584 |
+
for i in range(num_layers)
|
585 |
+
]
|
586 |
+
)
|
587 |
+
self.norm = sb.nnet.normalization.LayerNorm(d_model, eps=1e-6)
|
588 |
+
self.layerdrop_prob = layerdrop_prob
|
589 |
+
self.rng = np.random.default_rng()
|
590 |
+
|
591 |
+
def forward(
|
592 |
+
self,
|
593 |
+
src,
|
594 |
+
src_mask: Optional[torch.Tensor] = None,
|
595 |
+
src_key_padding_mask: Optional[torch.Tensor] = None,
|
596 |
+
pos_embs: Optional[torch.Tensor] = None,
|
597 |
+
dynchunktrain_config=None,
|
598 |
+
):
|
599 |
+
"""
|
600 |
+
Arguments
|
601 |
+
---------
|
602 |
+
src : torch.Tensor
|
603 |
+
The sequence to the encoder layer (required).
|
604 |
+
src_mask : torch.Tensor
|
605 |
+
The mask for the src sequence (optional).
|
606 |
+
src_key_padding_mask : torch.Tensor
|
607 |
+
The mask for the src keys per batch (optional).
|
608 |
+
pos_embs : torch.Tensor
|
609 |
+
The positional embedding tensor
|
610 |
+
dynchunktrain_config : config
|
611 |
+
Not supported for this encoder.
|
612 |
+
|
613 |
+
Returns
|
614 |
+
-------
|
615 |
+
output : torch.Tensor
|
616 |
+
The output of the transformer.
|
617 |
+
attention_lst : list
|
618 |
+
The attention values.
|
619 |
+
"""
|
620 |
+
assert (
|
621 |
+
dynchunktrain_config is None
|
622 |
+
), "Dynamic Chunk Training unsupported for this encoder"
|
623 |
+
|
624 |
+
output = src
|
625 |
+
if self.layerdrop_prob > 0.0:
|
626 |
+
keep_probs = self.rng.random(len(self.layers))
|
627 |
+
else:
|
628 |
+
keep_probs = None
|
629 |
+
attention_lst = []
|
630 |
+
for i, enc_layer in enumerate(self.layers):
|
631 |
+
if (
|
632 |
+
not self.training
|
633 |
+
or self.layerdrop_prob == 0.0
|
634 |
+
or keep_probs[i] > self.layerdrop_prob
|
635 |
+
):
|
636 |
+
output, attention = enc_layer(
|
637 |
+
output,
|
638 |
+
src_mask=src_mask,
|
639 |
+
src_key_padding_mask=src_key_padding_mask,
|
640 |
+
pos_embs=pos_embs,
|
641 |
+
)
|
642 |
+
|
643 |
+
attention_lst.append(attention)
|
644 |
+
output = self.norm(output)
|
645 |
+
return output, attention_lst
|
646 |
+
|
647 |
+
|
648 |
+
class TransformerDecoderLayer(nn.Module):
|
649 |
+
"""This class implements the self-attention decoder layer.
|
650 |
+
|
651 |
+
Arguments
|
652 |
+
---------
|
653 |
+
d_ffn : int
|
654 |
+
Hidden size of self-attention Feed Forward layer.
|
655 |
+
nhead : int
|
656 |
+
Number of attention heads.
|
657 |
+
d_model : int
|
658 |
+
Dimension of the model.
|
659 |
+
kdim : int
|
660 |
+
Dimension for key (optional).
|
661 |
+
vdim : int
|
662 |
+
Dimension for value (optional).
|
663 |
+
dropout : float
|
664 |
+
Dropout for the decoder (optional).
|
665 |
+
activation : Callable
|
666 |
+
Function to use between layers, default nn.ReLU
|
667 |
+
normalize_before : bool
|
668 |
+
Whether to normalize before layers.
|
669 |
+
attention_type : str
|
670 |
+
Type of attention to use, "regularMHA" or "RelPosMHAXL"
|
671 |
+
causal : bool
|
672 |
+
Whether to mask future positions.
|
673 |
+
|
674 |
+
Example
|
675 |
+
-------
|
676 |
+
>>> src = torch.rand((8, 60, 512))
|
677 |
+
>>> tgt = torch.rand((8, 60, 512))
|
678 |
+
>>> net = TransformerDecoderLayer(1024, 8, d_model=512)
|
679 |
+
>>> output, self_attn, multihead_attn = net(src, tgt)
|
680 |
+
>>> output.shape
|
681 |
+
torch.Size([8, 60, 512])
|
682 |
+
"""
|
683 |
+
|
684 |
+
def __init__(
|
685 |
+
self,
|
686 |
+
d_ffn,
|
687 |
+
nhead,
|
688 |
+
d_model,
|
689 |
+
kdim=None,
|
690 |
+
vdim=None,
|
691 |
+
dropout=0.0,
|
692 |
+
activation=nn.ReLU,
|
693 |
+
normalize_before=False,
|
694 |
+
attention_type="regularMHA",
|
695 |
+
causal=None,
|
696 |
+
):
|
697 |
+
super().__init__()
|
698 |
+
self.nhead = nhead
|
699 |
+
|
700 |
+
if attention_type == "regularMHA":
|
701 |
+
self.self_attn = sb.nnet.attention.MultiheadAttention(
|
702 |
+
nhead=nhead,
|
703 |
+
d_model=d_model,
|
704 |
+
kdim=kdim,
|
705 |
+
vdim=vdim,
|
706 |
+
dropout=dropout,
|
707 |
+
)
|
708 |
+
self.multihead_attn = sb.nnet.attention.MultiheadAttention(
|
709 |
+
nhead=nhead,
|
710 |
+
d_model=d_model,
|
711 |
+
kdim=kdim,
|
712 |
+
vdim=vdim,
|
713 |
+
dropout=dropout,
|
714 |
+
)
|
715 |
+
|
716 |
+
elif attention_type == "RelPosMHAXL":
|
717 |
+
self.self_attn = sb.nnet.attention.RelPosMHAXL(
|
718 |
+
d_model, nhead, dropout, mask_pos_future=causal
|
719 |
+
)
|
720 |
+
self.multihead_attn = sb.nnet.attention.RelPosMHAXL(
|
721 |
+
d_model, nhead, dropout, mask_pos_future=causal
|
722 |
+
)
|
723 |
+
|
724 |
+
self.pos_ffn = sb.nnet.attention.PositionalwiseFeedForward(
|
725 |
+
d_ffn=d_ffn,
|
726 |
+
input_size=d_model,
|
727 |
+
dropout=dropout,
|
728 |
+
activation=activation,
|
729 |
+
)
|
730 |
+
|
731 |
+
# normalization layers
|
732 |
+
self.norm1 = sb.nnet.normalization.LayerNorm(d_model, eps=1e-6)
|
733 |
+
self.norm2 = sb.nnet.normalization.LayerNorm(d_model, eps=1e-6)
|
734 |
+
self.norm3 = sb.nnet.normalization.LayerNorm(d_model, eps=1e-6)
|
735 |
+
self.dropout1 = torch.nn.Dropout(dropout)
|
736 |
+
self.dropout2 = torch.nn.Dropout(dropout)
|
737 |
+
self.dropout3 = torch.nn.Dropout(dropout)
|
738 |
+
|
739 |
+
self.normalize_before = normalize_before
|
740 |
+
|
741 |
+
def forward(
|
742 |
+
self,
|
743 |
+
tgt,
|
744 |
+
memory,
|
745 |
+
tgt_mask=None,
|
746 |
+
memory_mask=None,
|
747 |
+
tgt_key_padding_mask=None,
|
748 |
+
memory_key_padding_mask=None,
|
749 |
+
pos_embs_tgt=None,
|
750 |
+
pos_embs_src=None,
|
751 |
+
):
|
752 |
+
"""
|
753 |
+
Arguments
|
754 |
+
----------
|
755 |
+
tgt: torch.Tensor
|
756 |
+
The sequence to the decoder layer (required).
|
757 |
+
memory: torch.Tensor
|
758 |
+
The sequence from the last layer of the encoder (required).
|
759 |
+
tgt_mask: torch.Tensor
|
760 |
+
The mask for the tgt sequence (optional).
|
761 |
+
memory_mask: torch.Tensor
|
762 |
+
The mask for the memory sequence (optional).
|
763 |
+
tgt_key_padding_mask: torch.Tensor
|
764 |
+
The mask for the tgt keys per batch (optional).
|
765 |
+
memory_key_padding_mask: torch.Tensor
|
766 |
+
The mask for the memory keys per batch (optional).
|
767 |
+
pos_embs_tgt: torch.Tensor
|
768 |
+
The positional embeddings for the target (optional).
|
769 |
+
pos_embs_src: torch.Tensor
|
770 |
+
The positional embeddings for the source (optional).
|
771 |
+
"""
|
772 |
+
if self.normalize_before:
|
773 |
+
tgt1 = self.norm1(tgt)
|
774 |
+
else:
|
775 |
+
tgt1 = tgt
|
776 |
+
|
777 |
+
# self-attention over the target sequence
|
778 |
+
tgt2, self_attn = self.self_attn(
|
779 |
+
query=tgt1,
|
780 |
+
key=tgt1,
|
781 |
+
value=tgt1,
|
782 |
+
attn_mask=tgt_mask,
|
783 |
+
key_padding_mask=tgt_key_padding_mask,
|
784 |
+
pos_embs=pos_embs_tgt,
|
785 |
+
)
|
786 |
+
|
787 |
+
# add & norm
|
788 |
+
tgt = tgt + self.dropout1(tgt2)
|
789 |
+
if not self.normalize_before:
|
790 |
+
tgt = self.norm1(tgt)
|
791 |
+
|
792 |
+
if self.normalize_before:
|
793 |
+
tgt1 = self.norm2(tgt)
|
794 |
+
else:
|
795 |
+
tgt1 = tgt
|
796 |
+
|
797 |
+
# multi-head attention over the target sequence and encoder states
|
798 |
+
|
799 |
+
tgt2, multihead_attention = self.multihead_attn(
|
800 |
+
query=tgt1,
|
801 |
+
key=memory,
|
802 |
+
value=memory,
|
803 |
+
attn_mask=memory_mask,
|
804 |
+
key_padding_mask=memory_key_padding_mask,
|
805 |
+
pos_embs=pos_embs_src,
|
806 |
+
)
|
807 |
+
|
808 |
+
# add & norm
|
809 |
+
tgt = tgt + self.dropout2(tgt2)
|
810 |
+
if not self.normalize_before:
|
811 |
+
tgt = self.norm2(tgt)
|
812 |
+
|
813 |
+
if self.normalize_before:
|
814 |
+
tgt1 = self.norm3(tgt)
|
815 |
+
else:
|
816 |
+
tgt1 = tgt
|
817 |
+
|
818 |
+
tgt2 = self.pos_ffn(tgt1)
|
819 |
+
|
820 |
+
# add & norm
|
821 |
+
tgt = tgt + self.dropout3(tgt2)
|
822 |
+
if not self.normalize_before:
|
823 |
+
tgt = self.norm3(tgt)
|
824 |
+
|
825 |
+
return tgt, self_attn, multihead_attention
|
826 |
+
|
827 |
+
|
828 |
+
class TransformerDecoder(nn.Module):
|
829 |
+
"""This class implements the Transformer decoder.
|
830 |
+
|
831 |
+
Arguments
|
832 |
+
---------
|
833 |
+
num_layers : int
|
834 |
+
Number of transformer layers for the decoder.
|
835 |
+
nhead : int
|
836 |
+
Number of attention heads.
|
837 |
+
d_ffn : int
|
838 |
+
Hidden size of self-attention Feed Forward layer.
|
839 |
+
d_model : int
|
840 |
+
Dimension of the model.
|
841 |
+
kdim : int, optional
|
842 |
+
Dimension for key (Optional).
|
843 |
+
vdim : int, optional
|
844 |
+
Dimension for value (Optional).
|
845 |
+
dropout : float, optional
|
846 |
+
Dropout for the decoder (Optional).
|
847 |
+
activation : Callable
|
848 |
+
The function to apply between layers, default nn.ReLU
|
849 |
+
normalize_before : bool
|
850 |
+
Whether to normalize before layers.
|
851 |
+
causal : bool
|
852 |
+
Whether to mask future positions in decoding.
|
853 |
+
attention_type : str
|
854 |
+
Type of attention to use, "regularMHA" or "RelPosMHAXL"
|
855 |
+
|
856 |
+
Example
|
857 |
+
-------
|
858 |
+
>>> src = torch.rand((8, 60, 512))
|
859 |
+
>>> tgt = torch.rand((8, 60, 512))
|
860 |
+
>>> net = TransformerDecoder(1, 8, 1024, d_model=512)
|
861 |
+
>>> output, _, _ = net(src, tgt)
|
862 |
+
>>> output.shape
|
863 |
+
torch.Size([8, 60, 512])
|
864 |
+
"""
|
865 |
+
|
866 |
+
def __init__(
|
867 |
+
self,
|
868 |
+
num_layers,
|
869 |
+
nhead,
|
870 |
+
d_ffn,
|
871 |
+
d_model,
|
872 |
+
kdim=None,
|
873 |
+
vdim=None,
|
874 |
+
dropout=0.0,
|
875 |
+
activation=nn.ReLU,
|
876 |
+
normalize_before=False,
|
877 |
+
causal=False,
|
878 |
+
attention_type="regularMHA",
|
879 |
+
):
|
880 |
+
super().__init__()
|
881 |
+
self.layers = torch.nn.ModuleList(
|
882 |
+
[
|
883 |
+
TransformerDecoderLayer(
|
884 |
+
d_ffn=d_ffn,
|
885 |
+
nhead=nhead,
|
886 |
+
d_model=d_model,
|
887 |
+
kdim=kdim,
|
888 |
+
vdim=vdim,
|
889 |
+
dropout=dropout,
|
890 |
+
activation=activation,
|
891 |
+
normalize_before=normalize_before,
|
892 |
+
causal=causal,
|
893 |
+
attention_type=attention_type,
|
894 |
+
)
|
895 |
+
for _ in range(num_layers)
|
896 |
+
]
|
897 |
+
)
|
898 |
+
self.norm = sb.nnet.normalization.LayerNorm(d_model, eps=1e-6)
|
899 |
+
|
900 |
+
def forward(
|
901 |
+
self,
|
902 |
+
tgt,
|
903 |
+
memory,
|
904 |
+
tgt_mask=None,
|
905 |
+
memory_mask=None,
|
906 |
+
tgt_key_padding_mask=None,
|
907 |
+
memory_key_padding_mask=None,
|
908 |
+
pos_embs_tgt=None,
|
909 |
+
pos_embs_src=None,
|
910 |
+
):
|
911 |
+
"""
|
912 |
+
Arguments
|
913 |
+
----------
|
914 |
+
tgt : torch.Tensor
|
915 |
+
The sequence to the decoder layer (required).
|
916 |
+
memory : torch.Tensor
|
917 |
+
The sequence from the last layer of the encoder (required).
|
918 |
+
tgt_mask : torch.Tensor
|
919 |
+
The mask for the tgt sequence (optional).
|
920 |
+
memory_mask : torch.Tensor
|
921 |
+
The mask for the memory sequence (optional).
|
922 |
+
tgt_key_padding_mask : torch.Tensor
|
923 |
+
The mask for the tgt keys per batch (optional).
|
924 |
+
memory_key_padding_mask : torch.Tensor
|
925 |
+
The mask for the memory keys per batch (optional).
|
926 |
+
pos_embs_tgt : torch.Tensor
|
927 |
+
The positional embeddings for the target (optional).
|
928 |
+
pos_embs_src : torch.Tensor
|
929 |
+
The positional embeddings for the source (optional).
|
930 |
+
"""
|
931 |
+
output = tgt
|
932 |
+
self_attns, multihead_attns = [], []
|
933 |
+
for dec_layer in self.layers:
|
934 |
+
output, self_attn, multihead_attn = dec_layer(
|
935 |
+
output,
|
936 |
+
memory,
|
937 |
+
tgt_mask=tgt_mask,
|
938 |
+
memory_mask=memory_mask,
|
939 |
+
tgt_key_padding_mask=tgt_key_padding_mask,
|
940 |
+
memory_key_padding_mask=memory_key_padding_mask,
|
941 |
+
pos_embs_tgt=pos_embs_tgt,
|
942 |
+
pos_embs_src=pos_embs_src,
|
943 |
+
)
|
944 |
+
self_attns.append(self_attn)
|
945 |
+
multihead_attns.append(multihead_attn)
|
946 |
+
output = self.norm(output)
|
947 |
+
|
948 |
+
return output, self_attns, multihead_attns
|
949 |
+
|
950 |
+
|
951 |
+
class NormalizedEmbedding(nn.Module):
|
952 |
+
"""This class implements the normalized embedding layer for the transformer.
|
953 |
+
Since the dot product of the self-attention is always normalized by sqrt(d_model)
|
954 |
+
and the final linear projection for prediction shares weight with the embedding layer,
|
955 |
+
we multiply the output of the embedding by sqrt(d_model).
|
956 |
+
|
957 |
+
Arguments
|
958 |
+
---------
|
959 |
+
d_model: int
|
960 |
+
The number of expected features in the encoder/decoder inputs (default=512).
|
961 |
+
vocab: int
|
962 |
+
The vocab size.
|
963 |
+
|
964 |
+
Example
|
965 |
+
-------
|
966 |
+
>>> emb = NormalizedEmbedding(512, 1000)
|
967 |
+
>>> trg = torch.randint(0, 999, (8, 50))
|
968 |
+
>>> emb_fea = emb(trg)
|
969 |
+
"""
|
970 |
+
|
971 |
+
def __init__(self, d_model, vocab):
|
972 |
+
super().__init__()
|
973 |
+
self.emb = sb.nnet.embedding.Embedding(
|
974 |
+
num_embeddings=vocab, embedding_dim=d_model, blank_id=0
|
975 |
+
)
|
976 |
+
self.d_model = d_model
|
977 |
+
|
978 |
+
def forward(self, x):
|
979 |
+
"""Processes the input tensor x and returns an output tensor."""
|
980 |
+
return self.emb(x) * math.sqrt(self.d_model)
|
981 |
+
|
982 |
+
|
983 |
+
def get_key_padding_mask(padded_input, pad_idx):
|
984 |
+
"""Creates a binary mask to prevent attention to padded locations.
|
985 |
+
We suggest using ``get_mask_from_lengths`` instead of this function.
|
986 |
+
|
987 |
+
Arguments
|
988 |
+
---------
|
989 |
+
padded_input: torch.Tensor
|
990 |
+
Padded input.
|
991 |
+
pad_idx: int
|
992 |
+
idx for padding element.
|
993 |
+
|
994 |
+
Returns
|
995 |
+
-------
|
996 |
+
key_padded_mask: torch.Tensor
|
997 |
+
Binary mask to prevent attention to padding.
|
998 |
+
|
999 |
+
Example
|
1000 |
+
-------
|
1001 |
+
>>> a = torch.LongTensor([[1,1,0], [2,3,0], [4,5,0]])
|
1002 |
+
>>> get_key_padding_mask(a, pad_idx=0)
|
1003 |
+
tensor([[False, False, True],
|
1004 |
+
[False, False, True],
|
1005 |
+
[False, False, True]])
|
1006 |
+
"""
|
1007 |
+
if len(padded_input.shape) == 4:
|
1008 |
+
bz, time, ch1, ch2 = padded_input.shape
|
1009 |
+
padded_input = padded_input.reshape(bz, time, ch1 * ch2)
|
1010 |
+
|
1011 |
+
key_padded_mask = padded_input.eq(pad_idx).to(padded_input.device)
|
1012 |
+
|
1013 |
+
# if the input is more than 2d, mask the locations where they are silence
|
1014 |
+
# across all channels
|
1015 |
+
if len(padded_input.shape) > 2:
|
1016 |
+
key_padded_mask = key_padded_mask.float().prod(dim=-1).bool()
|
1017 |
+
return key_padded_mask.detach()
|
1018 |
+
|
1019 |
+
return key_padded_mask.detach()
|
1020 |
+
|
1021 |
+
|
1022 |
+
def get_lookahead_mask(padded_input):
|
1023 |
+
"""Creates a binary mask for each sequence which masks future frames.
|
1024 |
+
|
1025 |
+
Arguments
|
1026 |
+
---------
|
1027 |
+
padded_input: torch.Tensor
|
1028 |
+
Padded input tensor.
|
1029 |
+
|
1030 |
+
Returns
|
1031 |
+
-------
|
1032 |
+
mask : torch.Tensor
|
1033 |
+
Binary mask for masking future frames.
|
1034 |
+
|
1035 |
+
Example
|
1036 |
+
-------
|
1037 |
+
>>> a = torch.LongTensor([[1,1,0], [2,3,0], [4,5,0]])
|
1038 |
+
>>> get_lookahead_mask(a)
|
1039 |
+
tensor([[0., -inf, -inf],
|
1040 |
+
[0., 0., -inf],
|
1041 |
+
[0., 0., 0.]])
|
1042 |
+
"""
|
1043 |
+
seq_len = padded_input.shape[1]
|
1044 |
+
mask = (
|
1045 |
+
torch.triu(torch.ones((seq_len, seq_len), device=padded_input.device))
|
1046 |
+
== 1
|
1047 |
+
).transpose(0, 1)
|
1048 |
+
mask = (
|
1049 |
+
mask.float()
|
1050 |
+
.masked_fill(mask == 0, float("-inf"))
|
1051 |
+
.masked_fill(mask == 1, float(0.0))
|
1052 |
+
)
|
1053 |
+
return mask.detach().to(padded_input.device)
|
1054 |
+
|
1055 |
+
|
1056 |
+
def get_mask_from_lengths(lengths, max_len=None):
|
1057 |
+
"""Creates a binary mask from sequence lengths
|
1058 |
+
|
1059 |
+
Arguments
|
1060 |
+
---------
|
1061 |
+
lengths: torch.Tensor
|
1062 |
+
A tensor of sequence lengths
|
1063 |
+
max_len: int (Optional)
|
1064 |
+
Maximum sequence length, defaults to None.
|
1065 |
+
|
1066 |
+
Returns
|
1067 |
+
-------
|
1068 |
+
mask: torch.Tensor
|
1069 |
+
the mask where padded elements are set to True.
|
1070 |
+
Then one can use tensor.masked_fill_(mask, 0) for the masking.
|
1071 |
+
|
1072 |
+
Example
|
1073 |
+
-------
|
1074 |
+
>>> lengths = torch.tensor([3, 2, 4])
|
1075 |
+
>>> get_mask_from_lengths(lengths)
|
1076 |
+
tensor([[False, False, False, True],
|
1077 |
+
[False, False, True, True],
|
1078 |
+
[False, False, False, False]])
|
1079 |
+
"""
|
1080 |
+
if max_len is None:
|
1081 |
+
max_len = torch.max(lengths).item()
|
1082 |
+
seq_range = torch.arange(
|
1083 |
+
max_len, device=lengths.device, dtype=lengths.dtype
|
1084 |
+
)
|
1085 |
+
return ~(seq_range.unsqueeze(0) < lengths.unsqueeze(1))
|
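A small end-to-end sketch of the helpers defined above, with illustrative shapes and
the assumption that SpeechBrain is installed and the modules package is importable:
a padding mask and a look-ahead mask are built and passed to the TransformerEncoder.

    import torch
    from modules.Transformer import (
        TransformerEncoder,
        get_lookahead_mask,
        get_mask_from_lengths,
    )

    feats = torch.randn(3, 25, 64)                                # [batch, time, d_model]
    pad_mask = get_mask_from_lengths(torch.tensor([25, 20, 15]))  # True at padded steps
    causal_mask = get_lookahead_mask(feats)                       # [25, 25], -inf above diagonal

    encoder = TransformerEncoder(num_layers=2, nhead=4, d_ffn=128, d_model=64)
    out, attn = encoder(feats, src_mask=causal_mask, src_key_padding_mask=pad_mask)
    print(out.shape)                                              # torch.Size([3, 25, 64])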
model/modules/TransformerASR.py
ADDED
@@ -0,0 +1,682 @@
1 |
+
"""Added ConMamba and Mamba
|
2 |
+
|
3 |
+
Authors
|
4 |
+
* Xilin Jiang 2024
|
5 |
+
"""
|
6 |
+
|
7 |
+
"""Transformer for ASR in the SpeechBrain style.
|
8 |
+
|
9 |
+
Authors
|
10 |
+
* Jianyuan Zhong 2020
|
11 |
+
* Titouan Parcollet 2024
|
12 |
+
* Luca Della Libera 2024
|
13 |
+
"""
|
14 |
+
|
15 |
+
from dataclasses import dataclass
|
16 |
+
from typing import Any, Optional
|
17 |
+
|
18 |
+
import torch # noqa 42
|
19 |
+
from torch import nn
|
20 |
+
|
21 |
+
from speechbrain.dataio.dataio import length_to_mask
|
22 |
+
from modules.Transformer import (
|
23 |
+
NormalizedEmbedding,
|
24 |
+
TransformerInterface,
|
25 |
+
get_key_padding_mask,
|
26 |
+
get_lookahead_mask,
|
27 |
+
)
|
28 |
+
from speechbrain.nnet.activations import Swish
|
29 |
+
from speechbrain.nnet.containers import ModuleList
|
30 |
+
from speechbrain.nnet.linear import Linear
|
31 |
+
from speechbrain.utils.dynamic_chunk_training import DynChunkTrainConfig
|
32 |
+
|
33 |
+
|
34 |
+
@dataclass
|
35 |
+
class TransformerASRStreamingContext:
|
36 |
+
"""Streaming metadata and state for a `TransformerASR` instance."""
|
37 |
+
|
38 |
+
dynchunktrain_config: DynChunkTrainConfig
|
39 |
+
"""Dynamic Chunk Training configuration holding chunk size and context size
|
40 |
+
information."""
|
41 |
+
|
42 |
+
encoder_context: Any
|
43 |
+
"""Opaque encoder context information. It is constructed by the encoder's
|
44 |
+
`make_streaming_context` method and is passed to the encoder when using
|
45 |
+
`encode_streaming`.
|
46 |
+
"""
|
47 |
+
|
48 |
+
|
49 |
+
def make_transformer_src_mask(
|
50 |
+
src: torch.Tensor,
|
51 |
+
causal: bool = False,
|
52 |
+
dynchunktrain_config: Optional[DynChunkTrainConfig] = None,
|
53 |
+
) -> Optional[torch.Tensor]:
|
54 |
+
"""Prepare the source transformer mask that restricts which frames can
|
55 |
+
attend to which frames depending on causal or other simple restricted
|
56 |
+
attention methods.
|
57 |
+
|
58 |
+
Arguments
|
59 |
+
---------
|
60 |
+
src: torch.Tensor
|
61 |
+
The source tensor to build a mask from. The contents of the tensor are
|
62 |
+
not actually used currently; only its shape and other metadata (e.g.
|
63 |
+
device).
|
64 |
+
causal: bool
|
65 |
+
Whether strict causality shall be used. Frames will not be able to
|
66 |
+
attend to any future frame.
|
67 |
+
dynchunktrain_config: DynChunkTrainConfig, optional
|
68 |
+
Dynamic Chunk Training configuration. This implements a simple form of
|
69 |
+
chunkwise attention. Incompatible with `causal`.
|
70 |
+
|
71 |
+
Returns
|
72 |
+
-------
|
73 |
+
torch.Tensor
|
74 |
+
A boolean mask Tensor of shape (timesteps, timesteps).
|
75 |
+
"""
|
76 |
+
if causal:
|
77 |
+
assert dynchunktrain_config is None
|
78 |
+
return get_lookahead_mask(src)
|
79 |
+
|
80 |
+
if dynchunktrain_config is None:
|
81 |
+
return
|
82 |
+
|
83 |
+
# The following is not really the sole source used to implement this,
|
84 |
+
# but it helps introduce the concept.
|
85 |
+
# ref: Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition
|
86 |
+
# https://arxiv.org/pdf/2012.05481.pdf
|
87 |
+
timesteps = src.size(1)
|
88 |
+
|
89 |
+
# Mask the future at the right of each chunk
|
90 |
+
chunk_size = dynchunktrain_config.chunk_size
|
91 |
+
num_chunks = timesteps // chunk_size
|
92 |
+
timestep_idx = torch.arange(timesteps, device=src.device)
|
93 |
+
mask_idx = torch.arange(
|
94 |
+
chunk_size, chunk_size * (num_chunks + 2), chunk_size, device=src.device
|
95 |
+
).repeat_interleave(chunk_size)[:timesteps]
|
96 |
+
src_mask = timestep_idx[None] >= mask_idx[:, None]
|
97 |
+
|
98 |
+
# Mask the past at the left of each chunk (accounting for left context)
|
99 |
+
# only relevant if using left context
|
100 |
+
if not dynchunktrain_config.is_infinite_left_context():
|
101 |
+
num_left_chunks = dynchunktrain_config.left_context_size
|
102 |
+
mask_idx -= chunk_size * (num_left_chunks + 1)
|
103 |
+
src_mask += timestep_idx[None] < mask_idx[:, None]
|
104 |
+
|
105 |
+
return src_mask
|
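# Worked example of the chunkwise mask (rows = query frames, columns = key frames,
# True = blocked): with timesteps=6, chunk_size=2 and left_context_size=1 the
# returned src_mask is
#
#     frames 0-1:  F F T T T T   (chunk 0 attends only to itself)
#     frames 2-3:  F F F F T T   (chunk 1 attends to chunk 0 and itself)
#     frames 4-5:  T T F F F F   (chunk 2 attends to one left chunk and itself)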
106 |
+
|
107 |
+
|
108 |
+
def make_transformer_src_tgt_masks(
|
109 |
+
src,
|
110 |
+
tgt=None,
|
111 |
+
wav_len=None,
|
112 |
+
pad_idx=0,
|
113 |
+
causal: bool = False,
|
114 |
+
dynchunktrain_config: Optional[DynChunkTrainConfig] = None,
|
115 |
+
):
|
116 |
+
"""This function generates masks for training the transformer model,
|
117 |
+
opinionated for an ASR context with encoding masks and, optionally, decoding
|
118 |
+
masks (if specifying `tgt`).
|
119 |
+
|
120 |
+
Arguments
|
121 |
+
---------
|
122 |
+
src : torch.Tensor
|
123 |
+
The sequence to the encoder (required).
|
124 |
+
tgt : torch.Tensor
|
125 |
+
The sequence to the decoder.
|
126 |
+
wav_len : torch.Tensor
|
127 |
+
The lengths of the inputs.
|
128 |
+
pad_idx : int
|
129 |
+
The index for <pad> token (default=0).
|
130 |
+
causal: bool
|
131 |
+
Whether strict causality shall be used. See `make_asr_src_mask`
|
132 |
+
dynchunktrain_config: DynChunkTrainConfig, optional
|
133 |
+
Dynamic Chunk Training configuration. See `make_asr_src_mask`
|
134 |
+
|
135 |
+
Returns
|
136 |
+
-------
|
137 |
+
src_key_padding_mask : torch.Tensor
|
138 |
+
Key padding mask for ignoring padding
|
139 |
+
tgt_key_padding_mask : torch.Tensor
|
140 |
+
Key padding mask for ignoring padding
|
141 |
+
src_mask : torch.Tensor
|
142 |
+
Mask for ignoring invalid (e.g. future) timesteps
|
143 |
+
tgt_mask : torch.Tensor
|
144 |
+
Mask for ignoring invalid (e.g. future) timesteps
|
145 |
+
"""
|
146 |
+
src_key_padding_mask = None
|
147 |
+
|
148 |
+
# mask out audio beyond the length of audio for each batch
|
149 |
+
if wav_len is not None:
|
150 |
+
abs_len = torch.round(wav_len * src.shape[1])
|
151 |
+
src_key_padding_mask = ~length_to_mask(abs_len).bool()
|
152 |
+
|
153 |
+
# mask out the source
|
154 |
+
src_mask = make_transformer_src_mask(
|
155 |
+
src, causal=causal, dynchunktrain_config=dynchunktrain_config
|
156 |
+
)
|
157 |
+
|
158 |
+
# If no decoder in the transformer...
|
159 |
+
if tgt is not None:
|
160 |
+
tgt_key_padding_mask = get_key_padding_mask(tgt, pad_idx=pad_idx)
|
161 |
+
tgt_mask = get_lookahead_mask(tgt)
|
162 |
+
else:
|
163 |
+
tgt_key_padding_mask = None
|
164 |
+
tgt_mask = None
|
165 |
+
|
166 |
+
return src_key_padding_mask, tgt_key_padding_mask, src_mask, tgt_mask
|
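# Shape sketch for a padded batch (illustrative values):
#
#     feats = torch.randn(4, 100, 80)          # (batch, time, features)
#     tokens = torch.randint(1, 30, (4, 20))   # decoder inputs, 0 = <pad>
#     wav_len = torch.tensor([1.0, 0.9, 0.75, 0.5])
#     src_kpm, tgt_kpm, src_mask, tgt_mask = make_transformer_src_tgt_masks(
#         feats, tokens, wav_len, pad_idx=0, causal=False
#     )
#     # src_kpm: (4, 100) bool, tgt_kpm: (4, 20) bool,
#     # src_mask: None (non-causal, no chunking), tgt_mask: (20, 20) float (0 / -inf)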
167 |
+
|
168 |
+
|
169 |
+
class TransformerASR(TransformerInterface):
|
170 |
+
"""This is an implementation of transformer model for ASR.
|
171 |
+
|
172 |
+
The architecture is based on the paper "Attention Is All You Need":
|
173 |
+
https://arxiv.org/pdf/1706.03762.pdf
|
174 |
+
|
175 |
+
Arguments
|
176 |
+
---------
|
177 |
+
tgt_vocab: int
|
178 |
+
Size of vocabulary.
|
179 |
+
input_size: int
|
180 |
+
Input feature size.
|
181 |
+
d_model : int, optional
|
182 |
+
Embedding dimension size.
|
183 |
+
(default=512).
|
184 |
+
nhead : int, optional
|
185 |
+
The number of heads in the multi-head attention models (default=8).
|
186 |
+
num_encoder_layers : int, optional
|
187 |
+
The number of sub-encoder-layers in the encoder (default=6).
|
188 |
+
num_decoder_layers : int, optional
|
189 |
+
The number of sub-decoder-layers in the decoder (default=6).
|
190 |
+
d_ffn : int, optional
|
191 |
+
The dimension of the feedforward network model (default=2048).
|
192 |
+
dropout : int, optional
|
193 |
+
The dropout value (default=0.1).
|
194 |
+
activation : torch.nn.Module, optional
|
195 |
+
The activation function of FFN layers.
|
196 |
+
Recommended: relu or gelu (default=relu).
|
197 |
+
positional_encoding: str, optional
|
198 |
+
Type of positional encoding used. e.g. 'fixed_abs_sine' for fixed absolute positional encodings.
|
199 |
+
normalize_before: bool, optional
|
200 |
+
Whether normalization should be applied before or after MHA or FFN in Transformer layers.
|
201 |
+
Defaults to True as this was shown to lead to better performance and training stability.
|
202 |
+
kernel_size: int, optional
|
203 |
+
Kernel size in convolutional layers when Conformer is used.
|
204 |
+
bias: bool, optional
|
205 |
+
Whether to use bias in Conformer convolutional layers.
|
206 |
+
encoder_module: str, optional
|
207 |
+
Choose between Branchformer, Conformer, ConMamba, and Transformer for the encoder.
|
208 |
+
decoder_module: str, optional
|
209 |
+
Choose between Mamba and Transformer for the decoder.
|
212 |
+
conformer_activation: torch.nn.Module, optional
|
213 |
+
Activation module used after Conformer convolutional layers. E.g. Swish, ReLU etc. it has to be a torch Module.
|
214 |
+
branchformer_activation: torch.nn.Module, optional
|
215 |
+
Activation module used within the Branchformer Encoder. E.g. Swish, ReLU etc. it has to be a torch Module.
|
216 |
+
attention_type: str, optional
|
217 |
+
Type of attention layer used in all Transformer or Conformer layers.
|
218 |
+
e.g. regularMHA or RelPosMHA.
|
219 |
+
max_length: int, optional
|
220 |
+
Max length for the target and source sequence in input.
|
221 |
+
Used for positional encodings.
|
222 |
+
causal: bool, optional
|
223 |
+
Whether the encoder should be causal or not (the decoder is always causal).
|
224 |
+
If causal the Conformer convolutional layer is causal.
|
225 |
+
csgu_linear_units: int, optional
|
226 |
+
Number of neurons in the hidden linear units of the CSGU Module.
|
227 |
+
-> Branchformer
|
228 |
+
gate_activation: torch.nn.Module, optional
|
229 |
+
Activation function used at the gate of the CSGU module.
|
230 |
+
-> Branchformer
|
231 |
+
use_linear_after_conv: bool, optional
|
232 |
+
If True, will apply a linear transformation of size input_size//2.
|
233 |
+
-> Branchformer
|
234 |
+
mamba_config: dict, optional
|
235 |
+
Mamba parameters if encoder_module or decoder_module is Mamba or ConMamba
|
236 |
+
|
237 |
+
Example
|
238 |
+
-------
|
239 |
+
>>> src = torch.rand([8, 120, 512])
|
240 |
+
>>> tgt = torch.randint(0, 720, [8, 120])
|
241 |
+
>>> net = TransformerASR(
|
242 |
+
... 720, 512, 512, 8, 1, 1, 1024, activation=torch.nn.GELU
|
243 |
+
... )
|
244 |
+
>>> enc_out, dec_out = net.forward(src, tgt)
|
245 |
+
>>> enc_out.shape
|
246 |
+
torch.Size([8, 120, 512])
|
247 |
+
>>> dec_out.shape
|
248 |
+
torch.Size([8, 120, 512])
|
249 |
+
"""
|
250 |
+
|
251 |
+
def __init__(
|
252 |
+
self,
|
253 |
+
tgt_vocab,
|
254 |
+
input_size,
|
255 |
+
d_model=512,
|
256 |
+
nhead=8,
|
257 |
+
num_encoder_layers=6,
|
258 |
+
num_decoder_layers=6,
|
259 |
+
d_ffn=2048,
|
260 |
+
dropout=0.1,
|
261 |
+
activation=nn.ReLU,
|
262 |
+
positional_encoding="fixed_abs_sine",
|
263 |
+
normalize_before=False,
|
264 |
+
kernel_size: Optional[int] = 31,
|
265 |
+
bias: Optional[bool] = True,
|
266 |
+
encoder_module: Optional[str] = "transformer",
|
267 |
+
decoder_module: Optional[str] = "transformer",
|
268 |
+
conformer_activation: Optional[nn.Module] = Swish,
|
269 |
+
branchformer_activation: Optional[nn.Module] = nn.GELU,
|
270 |
+
attention_type: Optional[str] = "regularMHA",
|
271 |
+
max_length: Optional[int] = 2500,
|
272 |
+
causal: Optional[bool] = True,
|
273 |
+
csgu_linear_units: Optional[int] = 3072,
|
274 |
+
gate_activation: Optional[nn.Module] = nn.Identity,
|
275 |
+
use_linear_after_conv: Optional[bool] = False,
|
276 |
+
mamba_config=None
|
277 |
+
):
|
278 |
+
super().__init__(
|
279 |
+
d_model=d_model,
|
280 |
+
nhead=nhead,
|
281 |
+
num_encoder_layers=num_encoder_layers,
|
282 |
+
num_decoder_layers=num_decoder_layers,
|
283 |
+
d_ffn=d_ffn,
|
284 |
+
dropout=dropout,
|
285 |
+
activation=activation,
|
286 |
+
positional_encoding=positional_encoding,
|
287 |
+
normalize_before=normalize_before,
|
288 |
+
kernel_size=kernel_size,
|
289 |
+
bias=bias,
|
290 |
+
encoder_module=encoder_module,
|
291 |
+
decoder_module=decoder_module,
|
292 |
+
conformer_activation=conformer_activation,
|
293 |
+
branchformer_activation=branchformer_activation,
|
294 |
+
attention_type=attention_type,
|
295 |
+
max_length=max_length,
|
296 |
+
causal=causal,
|
297 |
+
csgu_linear_units=csgu_linear_units,
|
298 |
+
gate_activation=gate_activation,
|
299 |
+
use_linear_after_conv=use_linear_after_conv,
|
300 |
+
mamba_config=mamba_config
|
301 |
+
)
|
302 |
+
|
303 |
+
self.custom_src_module = ModuleList(
|
304 |
+
Linear(
|
305 |
+
input_size=input_size,
|
306 |
+
n_neurons=d_model,
|
307 |
+
bias=True,
|
308 |
+
combine_dims=False,
|
309 |
+
),
|
310 |
+
torch.nn.Dropout(dropout),
|
311 |
+
)
|
312 |
+
|
313 |
+
self.num_decoder_layers = num_decoder_layers
|
314 |
+
if num_decoder_layers > 0:
|
315 |
+
self.custom_tgt_module = ModuleList(
|
316 |
+
NormalizedEmbedding(d_model, tgt_vocab)
|
317 |
+
)
|
318 |
+
|
319 |
+
# reset parameters using xavier_normal_
|
320 |
+
self._init_params()
|
321 |
+
|
322 |
+
def forward(self, src, tgt, wav_len=None, pad_idx=0):
|
323 |
+
"""
|
324 |
+
Arguments
|
325 |
+
----------
|
326 |
+
src : torch.Tensor
|
327 |
+
The sequence to the encoder.
|
328 |
+
tgt : torch.Tensor
|
329 |
+
The sequence to the decoder.
|
330 |
+
wav_len: torch.Tensor, optional
|
331 |
+
Torch Tensor of shape (batch, ) containing the relative length to padded length for each example.
|
332 |
+
pad_idx : int, optional
|
333 |
+
The index for <pad> token (default=0).
|
334 |
+
"""
|
335 |
+
|
336 |
+
# reshape the src vector to [Batch, Time, Fea] if a 4d vector is given
|
337 |
+
if src.ndim == 4:
|
338 |
+
bz, t, ch1, ch2 = src.shape
|
339 |
+
src = src.reshape(bz, t, ch1 * ch2)
|
340 |
+
|
341 |
+
(
|
342 |
+
src_key_padding_mask,
|
343 |
+
tgt_key_padding_mask,
|
344 |
+
src_mask,
|
345 |
+
tgt_mask,
|
346 |
+
) = make_transformer_src_tgt_masks(
|
347 |
+
src, tgt, wav_len, causal=self.causal, pad_idx=pad_idx
|
348 |
+
)
|
349 |
+
|
350 |
+
src = self.custom_src_module(src)
|
351 |
+
# add pos encoding to the queries if absolute sinusoidal encodings are used
|
352 |
+
if self.attention_type == "hypermixing":
|
353 |
+
pos_embs_encoder = None
|
354 |
+
elif self.attention_type == "RelPosMHAXL":
|
355 |
+
pos_embs_encoder = self.positional_encoding(src)
|
356 |
+
elif self.positional_encoding_type == "fixed_abs_sine":
|
357 |
+
src = src + self.positional_encoding(src) # add the encodings here
|
358 |
+
pos_embs_encoder = None
|
359 |
+
|
360 |
+
encoder_out, _ = self.encoder(
|
361 |
+
src=src,
|
362 |
+
src_mask=src_mask,
|
363 |
+
src_key_padding_mask=src_key_padding_mask,
|
364 |
+
pos_embs=pos_embs_encoder,
|
365 |
+
)
|
366 |
+
|
367 |
+
if self.num_decoder_layers > 0:
|
368 |
+
tgt = self.custom_tgt_module(tgt)
|
369 |
+
|
370 |
+
if self.attention_type == "RelPosMHAXL":
|
371 |
+
tgt = tgt + self.positional_encoding_decoder(tgt)
|
372 |
+
pos_embs_encoder = None # self.positional_encoding(src)
|
373 |
+
pos_embs_target = None
|
374 |
+
elif (
|
375 |
+
self.positional_encoding_type == "fixed_abs_sine"
|
376 |
+
or self.attention_type == "hypermixing"
|
377 |
+
):
|
378 |
+
tgt = tgt + self.positional_encoding(tgt)
|
379 |
+
pos_embs_target = None
|
380 |
+
pos_embs_encoder = None
|
381 |
+
|
382 |
+
decoder_out, _, _ = self.decoder(
|
383 |
+
tgt=tgt,
|
384 |
+
memory=encoder_out,
|
385 |
+
memory_mask=None,
|
386 |
+
tgt_mask=tgt_mask,
|
387 |
+
tgt_key_padding_mask=tgt_key_padding_mask,
|
388 |
+
memory_key_padding_mask=src_key_padding_mask,
|
389 |
+
pos_embs_tgt=pos_embs_target,
|
390 |
+
pos_embs_src=pos_embs_encoder,
|
391 |
+
)
|
392 |
+
|
393 |
+
else:
|
394 |
+
decoder_out = None
|
395 |
+
|
396 |
+
return encoder_out, decoder_out
|
397 |
+
|
398 |
+
@torch.no_grad()
|
399 |
+
def decode(self, tgt, encoder_out, enc_len=None):
|
400 |
+
"""This method implements a decoding step for the transformer model.
|
401 |
+
|
402 |
+
Arguments
|
403 |
+
---------
|
404 |
+
tgt : torch.Tensor
|
405 |
+
The sequence to the decoder.
|
406 |
+
encoder_out : torch.Tensor
|
407 |
+
Hidden output of the encoder.
|
408 |
+
enc_len : torch.LongTensor
|
409 |
+
The actual length of encoder states.
|
410 |
+
|
411 |
+
Returns
|
412 |
+
-------
|
413 |
+
prediction
|
414 |
+
"""
|
415 |
+
tgt_mask = get_lookahead_mask(tgt)
|
416 |
+
src_key_padding_mask = None
|
417 |
+
if enc_len is not None:
|
418 |
+
src_key_padding_mask = (1 - length_to_mask(enc_len)).bool()
|
419 |
+
|
420 |
+
if self.num_decoder_layers > 0:
|
421 |
+
tgt = self.custom_tgt_module(tgt)
|
422 |
+
if self.attention_type == "RelPosMHAXL":
|
423 |
+
tgt = tgt + self.positional_encoding_decoder(tgt)
|
424 |
+
pos_embs_encoder = None # self.positional_encoding(src)
|
425 |
+
pos_embs_target = None
|
426 |
+
elif (
|
427 |
+
self.positional_encoding_type == "fixed_abs_sine"
|
428 |
+
or self.attention_type == "hypermixing"
|
429 |
+
):
|
430 |
+
tgt = tgt + self.positional_encoding(tgt) # add the encodings here
|
431 |
+
pos_embs_target = None
|
432 |
+
pos_embs_encoder = None
|
433 |
+
|
434 |
+
|
435 |
+
prediction, self_attns, multihead_attns = self.decoder(
|
436 |
+
tgt,
|
437 |
+
encoder_out,
|
438 |
+
tgt_mask=tgt_mask,
|
439 |
+
memory_key_padding_mask=src_key_padding_mask,
|
440 |
+
pos_embs_tgt=pos_embs_target,
|
441 |
+
pos_embs_src=pos_embs_encoder,
|
442 |
+
)
|
443 |
+
return prediction, multihead_attns[-1]
|
444 |
+
|
445 |
+
def encode(
|
446 |
+
self,
|
447 |
+
src,
|
448 |
+
wav_len=None,
|
449 |
+
pad_idx=0,
|
450 |
+
dynchunktrain_config: Optional[DynChunkTrainConfig] = None,
|
451 |
+
):
|
452 |
+
"""
|
453 |
+
Encoder forward pass
|
454 |
+
|
455 |
+
Arguments
|
456 |
+
---------
|
457 |
+
src : torch.Tensor
|
458 |
+
The sequence to the encoder.
|
459 |
+
wav_len : torch.Tensor, optional
|
460 |
+
Torch Tensor of shape (batch, ) containing the relative length to padded length for each example.
|
461 |
+
pad_idx : int
|
462 |
+
The index used for padding.
|
463 |
+
dynchunktrain_config : DynChunkTrainConfig
|
464 |
+
Dynamic chunking config.
|
465 |
+
|
466 |
+
Returns
|
467 |
+
-------
|
468 |
+
encoder_out : torch.Tensor
|
469 |
+
"""
|
470 |
+
# reshape the src vector to [Batch, Time, Fea] if a 4d vector is given
|
471 |
+
if src.dim() == 4:
|
472 |
+
bz, t, ch1, ch2 = src.shape
|
473 |
+
src = src.reshape(bz, t, ch1 * ch2)
|
474 |
+
|
475 |
+
(
|
476 |
+
src_key_padding_mask,
|
477 |
+
_,
|
478 |
+
src_mask,
|
479 |
+
_,
|
480 |
+
) = make_transformer_src_tgt_masks(
|
481 |
+
src,
|
482 |
+
None,
|
483 |
+
wav_len,
|
484 |
+
pad_idx=pad_idx,
|
485 |
+
causal=self.causal,
|
486 |
+
dynchunktrain_config=dynchunktrain_config,
|
487 |
+
)
|
488 |
+
|
489 |
+
src = self.custom_src_module(src)
|
490 |
+
if self.attention_type == "hypermixing":
|
491 |
+
pos_embs_source = None
|
492 |
+
elif self.attention_type == "RelPosMHAXL":
|
493 |
+
pos_embs_source = self.positional_encoding(src)
|
494 |
+
elif self.positional_encoding_type == "fixed_abs_sine":
|
495 |
+
src = src + self.positional_encoding(src)
|
496 |
+
pos_embs_source = None
|
497 |
+
|
498 |
+
encoder_out, _ = self.encoder(
|
499 |
+
src=src,
|
500 |
+
src_mask=src_mask,
|
501 |
+
src_key_padding_mask=src_key_padding_mask,
|
502 |
+
pos_embs=pos_embs_source,
|
503 |
+
dynchunktrain_config=dynchunktrain_config,
|
504 |
+
)
|
505 |
+
|
506 |
+
return encoder_out
|
507 |
+
|
508 |
+
def encode_streaming(self, src, context: TransformerASRStreamingContext):
|
509 |
+
"""
|
510 |
+
Streaming encoder forward pass
|
511 |
+
|
512 |
+
Arguments
|
513 |
+
---------
|
514 |
+
src : torch.Tensor
|
515 |
+
The sequence (chunk) to the encoder.
|
516 |
+
context : TransformerASRStreamingContext
|
517 |
+
Mutable reference to the streaming context. This holds the state
|
518 |
+
needed to persist across chunk inferences and can be built using
|
519 |
+
`make_streaming_context`. This will get mutated by this function.
|
520 |
+
|
521 |
+
Returns
|
522 |
+
-------
|
523 |
+
Encoder output for this chunk.
|
524 |
+
|
525 |
+
Example
|
526 |
+
-------
|
527 |
+
>>> import torch
|
528 |
+
>>> from speechbrain.lobes.models.transformer.TransformerASR import TransformerASR
|
529 |
+
>>> from speechbrain.utils.dynamic_chunk_training import DynChunkTrainConfig
|
530 |
+
>>> net = TransformerASR(
|
531 |
+
... tgt_vocab=100,
|
532 |
+
... input_size=64,
|
533 |
+
... d_model=64,
|
534 |
+
... nhead=8,
|
535 |
+
... num_encoder_layers=1,
|
536 |
+
... num_decoder_layers=0,
|
537 |
+
... d_ffn=128,
|
538 |
+
... attention_type="RelPosMHAXL",
|
539 |
+
... positional_encoding=None,
|
540 |
+
... encoder_module="conformer",
|
541 |
+
... normalize_before=True,
|
542 |
+
... causal=False,
|
543 |
+
... )
|
544 |
+
>>> ctx = net.make_streaming_context(DynChunkTrainConfig(16, 1))
|
545 |
+
>>> src1 = torch.rand([8, 16, 64])
|
546 |
+
>>> src2 = torch.rand([8, 16, 64])
|
547 |
+
>>> out1 = net.encode_streaming(src1, ctx)
|
548 |
+
>>> out1.shape
|
549 |
+
torch.Size([8, 16, 64])
|
550 |
+
>>> ctx.encoder_context.layers[0].mha_left_context.shape
|
551 |
+
torch.Size([8, 16, 64])
|
552 |
+
>>> out2 = net.encode_streaming(src2, ctx)
|
553 |
+
>>> out2.shape
|
554 |
+
torch.Size([8, 16, 64])
|
555 |
+
>>> ctx.encoder_context.layers[0].mha_left_context.shape
|
556 |
+
torch.Size([8, 16, 64])
|
557 |
+
>>> combined_out = torch.concat((out1, out2), dim=1)
|
558 |
+
>>> combined_out.shape
|
559 |
+
torch.Size([8, 32, 64])
|
560 |
+
"""
|
561 |
+
|
562 |
+
if src.dim() == 4:
|
563 |
+
bz, t, ch1, ch2 = src.shape
|
564 |
+
src = src.reshape(bz, t, ch1 * ch2)
|
565 |
+
|
566 |
+
# HACK: our problem here is that the positional_encoding is computed
|
567 |
+
# against the size of our source tensor, but we only know how many left
|
568 |
+
# context frames we're injecting to the encoder within the encoder
|
569 |
+
# context.
|
570 |
+
# so this workaround does just that.
|
571 |
+
#
|
572 |
+
# i'm not sure how this would be best refactored, but an option would be
|
573 |
+
# to let the encoder get the pos embedding itself and have a way to
|
574 |
+
# cache it.
|
575 |
+
#
|
576 |
+
# additionally, positional encoding functions take in a whole source
|
577 |
+
# tensor just to get its attributes (size, device, type) but this is
|
578 |
+
# sort of silly for the embeddings that don't need one.
|
579 |
+
# so we craft a dummy empty (uninitialized) tensor to help...
|
580 |
+
known_left_context = context.encoder_context.layers[0].mha_left_context
|
581 |
+
if known_left_context is None:
|
582 |
+
pos_encoding_dummy = src
|
583 |
+
else:
|
584 |
+
target_shape = list(src.shape)
|
585 |
+
target_shape[-2] += known_left_context.shape[-2]
|
586 |
+
pos_encoding_dummy = torch.empty(size=target_shape).to(src)
|
587 |
+
|
588 |
+
src = self.custom_src_module(src)
|
589 |
+
if self.attention_type == "RelPosMHAXL":
|
590 |
+
pos_embs_source = self.positional_encoding(pos_encoding_dummy)
|
591 |
+
|
592 |
+
elif self.positional_encoding_type == "fixed_abs_sine":
|
593 |
+
src = src + self.positional_encoding(pos_encoding_dummy)
|
594 |
+
pos_embs_source = None
|
595 |
+
|
596 |
+
encoder_out, _ = self.encoder.forward_streaming(
|
597 |
+
src=src, pos_embs=pos_embs_source, context=context.encoder_context
|
598 |
+
)
|
599 |
+
return encoder_out
|
600 |
+
|
601 |
+
def make_streaming_context(
|
602 |
+
self, dynchunktrain_config: DynChunkTrainConfig, encoder_kwargs={}
|
603 |
+
):
|
604 |
+
"""Creates a blank streaming context for this transformer and its
|
605 |
+
encoder.
|
606 |
+
|
607 |
+
Arguments
|
608 |
+
---------
|
609 |
+
dynchunktrain_config : DynChunkTrainConfig
|
610 |
+
Runtime chunkwise attention configuration.
|
611 |
+
encoder_kwargs : dict
|
612 |
+
Parameters to be forward to the encoder's `make_streaming_context`.
|
613 |
+
Metadata required for the encoder could differ depending on the
|
614 |
+
encoder.
|
615 |
+
|
616 |
+
Returns
|
617 |
+
-------
|
618 |
+
TransformerASRStreamingContext
|
619 |
+
"""
|
620 |
+
return TransformerASRStreamingContext(
|
621 |
+
dynchunktrain_config=dynchunktrain_config,
|
622 |
+
encoder_context=self.encoder.make_streaming_context(
|
623 |
+
dynchunktrain_config,
|
624 |
+
**encoder_kwargs,
|
625 |
+
),
|
626 |
+
)
|
627 |
+
|
628 |
+
def _init_params(self):
|
629 |
+
for p in self.parameters():
|
630 |
+
if p.dim() > 1:
|
631 |
+
torch.nn.init.xavier_normal_(p)
|
632 |
+
|
633 |
+
|
634 |
+
class EncoderWrapper(nn.Module):
|
635 |
+
"""This is a wrapper of any ASR transformer encoder. By default, the
|
636 |
+
TransformerASR .forward() function encodes and decodes. With this wrapper
|
637 |
+
the .forward() function becomes .encode() only.
|
638 |
+
|
639 |
+
Important: The TransformerASR class must contain a .encode() function.
|
640 |
+
|
641 |
+
Arguments
|
642 |
+
---------
|
643 |
+
transformer : sb.lobes.models.TransformerInterface
|
644 |
+
A Transformer instance that contains a .encode() function.
|
645 |
+
*args : tuple
|
646 |
+
**kwargs : dict
|
647 |
+
Arguments to forward to parent class.
|
648 |
+
|
649 |
+
Example
|
650 |
+
-------
|
651 |
+
>>> src = torch.rand([8, 120, 512])
|
652 |
+
>>> tgt = torch.randint(0, 720, [8, 120])
|
653 |
+
>>> net = TransformerASR(
|
654 |
+
... 720, 512, 512, 8, 1, 1, 1024, activation=torch.nn.GELU
|
655 |
+
... )
|
656 |
+
>>> encoder = EncoderWrapper(net)
|
657 |
+
>>> enc_out = encoder(src)
|
658 |
+
>>> enc_out.shape
|
659 |
+
torch.Size([8, 120, 512])
|
660 |
+
"""
|
661 |
+
|
662 |
+
def __init__(self, transformer, *args, **kwargs):
|
663 |
+
super().__init__(*args, **kwargs)
|
664 |
+
self.transformer = transformer
|
665 |
+
self.make_streaming_context = self.transformer.make_streaming_context
|
666 |
+
|
667 |
+
def forward(self, x, wav_lens=None, pad_idx=0, **kwargs):
|
668 |
+
"""Processes the input tensor x and returns an output tensor."""
|
669 |
+
x = self.transformer.encode(x, wav_lens, pad_idx, **kwargs)
|
670 |
+
return x
|
671 |
+
|
672 |
+
def forward_streaming(self, x, context):
|
673 |
+
"""Processes the input audio chunk tensor `x`, using and updating the
|
674 |
+
mutable encoder `context`"""
|
675 |
+
x = self.transformer.encode_streaming(x, context)
|
676 |
+
return x
|
677 |
+
|
678 |
+
def make_streaming_context(self, *args, **kwargs):
|
679 |
+
"""Initializes a streaming context. Forwards all arguments to the
|
680 |
+
underlying transformer. See :meth:`speechbrain.lobes.models.transformer.TransformerASR.make_streaming_context`.
|
681 |
+
"""
|
682 |
+
return self.transformer.make_streaming_context(*args, **kwargs)
|
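A construction sketch for the ConMamba / Mamba variant of TransformerASR. The exact `encoder_module` string and the `mamba_config` keys are assumptions for illustration (they are consumed by TransformerInterface, which is not part of this file), so treat this as a sketch rather than a verified recipe:

import torch
from modules.TransformerASR import TransformerASR

mamba_config = {"d_state": 16, "expand": 2, "d_conv": 4, "bidirectional": True}  # assumed keys

net = TransformerASR(
    tgt_vocab=5000,
    input_size=80,
    d_model=256,
    nhead=4,
    num_encoder_layers=12,
    num_decoder_layers=0,        # encoder-only: forward() returns (enc_out, None)
    d_ffn=1024,
    encoder_module="conmamba",   # assumed literal, see the docstring above
    causal=False,
    mamba_config=mamba_config,
)

feats = torch.rand(8, 120, 80)                  # (batch, time, features)
enc_out, dec_out = net(feats, tgt=None, wav_len=torch.ones(8))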
model/modules/__init__.py
ADDED
File without changes
|
model/modules/mamba/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
model/modules/mamba/__init__.py
ADDED
File without changes
|
model/modules/mamba/bimamba.py
ADDED
@@ -0,0 +1,465 @@
1 |
+
'''
|
2 |
+
Copied and modified from
|
3 |
+
https://github.com/hustvl/Vim/blob/main/mamba-1p1p1/mamba_ssm/modules/mamba_simple.py
|
4 |
+
'''
|
5 |
+
|
6 |
+
# Copyright (c) 2023, Tri Dao, Albert Gu.
|
7 |
+
|
8 |
+
import math
|
9 |
+
from typing import Optional
|
10 |
+
|
11 |
+
import torch
|
12 |
+
import torch.nn as nn
|
13 |
+
import torch.nn.functional as F
|
14 |
+
from torch import Tensor
|
15 |
+
|
16 |
+
from einops import rearrange, repeat
|
17 |
+
|
18 |
+
try:
|
19 |
+
from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
|
20 |
+
except ImportError:
|
21 |
+
causal_conv1d_fn, causal_conv1d_update = None, None
|
22 |
+
|
23 |
+
try:
|
24 |
+
from .selective_scan_interface import selective_scan_fn, mamba_inner_fn, bimamba_inner_fn, mamba_inner_fn_no_out_proj
|
25 |
+
except ImportError:
|
26 |
+
selective_scan_fn, mamba_inner_fn, bimamba_inner_fn, mamba_inner_fn_no_out_proj = None, None, None, None
|
27 |
+
|
28 |
+
try:
|
29 |
+
from mamba_ssm.ops.triton.selective_state_update import selective_state_update
|
30 |
+
except ImportError:
|
31 |
+
selective_state_update = None
|
32 |
+
|
33 |
+
try:
|
34 |
+
from mamba_ssm.ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn
|
35 |
+
except ImportError:
|
36 |
+
RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
|
37 |
+
|
38 |
+
|
39 |
+
class Mamba(nn.Module):
|
40 |
+
def __init__(
|
41 |
+
self,
|
42 |
+
d_model,
|
43 |
+
d_state=16,
|
44 |
+
d_conv=4,
|
45 |
+
expand=2,
|
46 |
+
dt_rank="auto",
|
47 |
+
dt_min=0.001,
|
48 |
+
dt_max=0.1,
|
49 |
+
dt_init="random",
|
50 |
+
dt_scale=1.0,
|
51 |
+
dt_init_floor=1e-4,
|
52 |
+
conv_bias=True,
|
53 |
+
bias=False,
|
54 |
+
use_fast_path=True, # Fused kernel options
|
55 |
+
layer_idx=None,
|
56 |
+
device=None,
|
57 |
+
dtype=None,
|
58 |
+
bimamba_type="none",
|
59 |
+
if_devide_out=True, # False
|
60 |
+
init_layer_scale=None,
|
61 |
+
):
|
62 |
+
factory_kwargs = {"device": device, "dtype": dtype}
|
63 |
+
super().__init__()
|
64 |
+
self.d_model = d_model
|
65 |
+
self.d_state = d_state
|
66 |
+
self.d_conv = d_conv
|
67 |
+
self.expand = expand
|
68 |
+
self.d_inner = int(self.expand * self.d_model)
|
69 |
+
self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank
|
70 |
+
self.use_fast_path = use_fast_path
|
71 |
+
self.layer_idx = layer_idx
|
72 |
+
self.bimamba_type = bimamba_type
|
73 |
+
self.if_devide_out = if_devide_out
|
74 |
+
|
75 |
+
assert bimamba_type == 'v2'
|
76 |
+
|
77 |
+
self.init_layer_scale = init_layer_scale
|
78 |
+
if init_layer_scale is not None:
|
79 |
+
self.gamma = nn.Parameter(init_layer_scale * torch.ones((d_model)), requires_grad=True)
|
80 |
+
|
81 |
+
self.in_proj = nn.Linear(self.d_model, self.d_inner * 2, bias=bias, **factory_kwargs)
|
82 |
+
|
83 |
+
self.conv1d = nn.Conv1d(
|
84 |
+
in_channels=self.d_inner,
|
85 |
+
out_channels=self.d_inner,
|
86 |
+
bias=conv_bias,
|
87 |
+
kernel_size=d_conv,
|
88 |
+
groups=self.d_inner,
|
89 |
+
padding=d_conv - 1,
|
90 |
+
**factory_kwargs,
|
91 |
+
)
|
92 |
+
|
93 |
+
self.activation = "silu"
|
94 |
+
self.act = nn.SiLU()
|
95 |
+
|
96 |
+
self.x_proj = nn.Linear(
|
97 |
+
self.d_inner, self.dt_rank + self.d_state * 2, bias=False, **factory_kwargs
|
98 |
+
)
|
99 |
+
self.dt_proj = nn.Linear(self.dt_rank, self.d_inner, bias=True, **factory_kwargs)
|
100 |
+
|
101 |
+
# Initialize special dt projection to preserve variance at initialization
|
102 |
+
dt_init_std = self.dt_rank**-0.5 * dt_scale
|
103 |
+
if dt_init == "constant":
|
104 |
+
nn.init.constant_(self.dt_proj.weight, dt_init_std)
|
105 |
+
elif dt_init == "random":
|
106 |
+
nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std)
|
107 |
+
else:
|
108 |
+
raise NotImplementedError
|
109 |
+
|
110 |
+
# Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
|
111 |
+
dt = torch.exp(
|
112 |
+
torch.rand(self.d_inner, **factory_kwargs) * (math.log(dt_max) - math.log(dt_min))
|
113 |
+
+ math.log(dt_min)
|
114 |
+
).clamp(min=dt_init_floor)
|
115 |
+
# Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
|
116 |
+
inv_dt = dt + torch.log(-torch.expm1(-dt))
|
117 |
+
with torch.no_grad():
|
118 |
+
self.dt_proj.bias.copy_(inv_dt)
|
119 |
+
# Our initialization would set all Linear.bias to zero, need to mark this one as _no_reinit
|
120 |
+
self.dt_proj.bias._no_reinit = True
|
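# Why this is the inverse of softplus: we want a bias b with softplus(b) = dt, i.e.
#   b = log(exp(dt) - 1) = dt + log(1 - exp(-dt)) = dt + log(-expm1(-dt)),
# and the expm1 form is numerically stable for small dt. After this copy_,
# F.softplus(self.dt_proj.bias) therefore lands (approximately) in [dt_min, dt_max].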
121 |
+
|
122 |
+
# S4D real initialization
|
123 |
+
A = repeat(
|
124 |
+
torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device),
|
125 |
+
"n -> d n",
|
126 |
+
d=self.d_inner,
|
127 |
+
).contiguous()
|
128 |
+
A_log = torch.log(A) # Keep A_log in fp32
|
129 |
+
self.A_log = nn.Parameter(A_log)
|
130 |
+
self.A_log._no_weight_decay = True
|
131 |
+
|
132 |
+
# D "skip" parameter
|
133 |
+
self.D = nn.Parameter(torch.ones(self.d_inner, device=device)) # Keep in fp32
|
134 |
+
self.D._no_weight_decay = True
|
135 |
+
|
136 |
+
# bidirectional
|
137 |
+
if bimamba_type == "v1":
|
138 |
+
A_b = repeat(
|
139 |
+
torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device),
|
140 |
+
"n -> d n",
|
141 |
+
d=self.d_inner,
|
142 |
+
).contiguous()
|
143 |
+
A_b_log = torch.log(A_b) # Keep A_b_log in fp32
|
144 |
+
self.A_b_log = nn.Parameter(A_b_log)
|
145 |
+
self.A_b_log._no_weight_decay = True
|
146 |
+
elif bimamba_type == "v2":
|
147 |
+
A_b = repeat(
|
148 |
+
torch.arange(1, self.d_state + 1, dtype=torch.float32, device=device),
|
149 |
+
"n -> d n",
|
150 |
+
d=self.d_inner,
|
151 |
+
).contiguous()
|
152 |
+
A_b_log = torch.log(A_b) # Keep A_b_log in fp32
|
153 |
+
self.A_b_log = nn.Parameter(A_b_log)
|
154 |
+
self.A_b_log._no_weight_decay = True
|
155 |
+
|
156 |
+
self.conv1d_b = nn.Conv1d(
|
157 |
+
in_channels=self.d_inner,
|
158 |
+
out_channels=self.d_inner,
|
159 |
+
bias=conv_bias,
|
160 |
+
kernel_size=d_conv,
|
161 |
+
groups=self.d_inner,
|
162 |
+
padding=d_conv - 1,
|
163 |
+
**factory_kwargs,
|
164 |
+
)
|
165 |
+
|
166 |
+
self.x_proj_b = nn.Linear(
|
167 |
+
self.d_inner, self.dt_rank + self.d_state * 2, bias=False, **factory_kwargs
|
168 |
+
)
|
169 |
+
self.dt_proj_b = nn.Linear(self.dt_rank, self.d_inner, bias=True, **factory_kwargs)
|
170 |
+
|
171 |
+
self.D_b = nn.Parameter(torch.ones(self.d_inner, device=device)) # Keep in fp32
|
172 |
+
self.D_b._no_weight_decay = True
|
173 |
+
|
174 |
+
self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs)
|
175 |
+
|
176 |
+
def forward(self, hidden_states, inference_params=None):
|
177 |
+
"""
|
178 |
+
hidden_states: (B, L, D)
|
179 |
+
Returns: same shape as hidden_states
|
180 |
+
"""
|
181 |
+
batch, seqlen, dim = hidden_states.shape
|
182 |
+
conv_state, ssm_state = None, None
|
183 |
+
|
184 |
+
if inference_params is not None:
|
185 |
+
conv_state, ssm_state = self._get_states_from_cache(inference_params, batch)
|
186 |
+
if inference_params.seqlen_offset > 0:
|
187 |
+
# The states are updated inplace
|
188 |
+
out, _, _ = self.step(hidden_states, conv_state, ssm_state)
|
189 |
+
return out
|
190 |
+
|
191 |
+
# We do matmul and transpose BLH -> HBL at the same time
|
192 |
+
xz = rearrange(
|
193 |
+
self.in_proj.weight @ rearrange(hidden_states, "b l d -> d (b l)"),
|
194 |
+
"d (b l) -> b d l",
|
195 |
+
l=seqlen,
|
196 |
+
)
|
197 |
+
if self.in_proj.bias is not None:
|
198 |
+
xz = xz + rearrange(self.in_proj.bias.to(dtype=xz.dtype), "d -> d 1")
|
199 |
+
|
200 |
+
A = -torch.exp(self.A_log.float()) # (d_inner, d_state)
|
201 |
+
# In the backward pass we write dx and dz next to each other to avoid torch.cat
|
202 |
+
if self.use_fast_path and inference_params is None: # Doesn't support outputting the states
|
203 |
+
if self.bimamba_type == "v1":
|
204 |
+
A_b = -torch.exp(self.A_b_log.float())
|
205 |
+
out = bimamba_inner_fn(
|
206 |
+
xz,
|
207 |
+
self.conv1d.weight,
|
208 |
+
self.conv1d.bias,
|
209 |
+
self.x_proj.weight,
|
210 |
+
self.dt_proj.weight,
|
211 |
+
self.out_proj.weight,
|
212 |
+
self.out_proj.bias,
|
213 |
+
A,
|
214 |
+
A_b,
|
215 |
+
None, # input-dependent B
|
216 |
+
None, # input-dependent C
|
217 |
+
self.D.float(),
|
218 |
+
delta_bias=self.dt_proj.bias.float(),
|
219 |
+
delta_softplus=True,
|
220 |
+
)
|
221 |
+
elif self.bimamba_type == "v2":
|
222 |
+
A_b = -torch.exp(self.A_b_log.float())
|
223 |
+
out = mamba_inner_fn_no_out_proj(
|
224 |
+
xz,
|
225 |
+
self.conv1d.weight,
|
226 |
+
self.conv1d.bias,
|
227 |
+
self.x_proj.weight,
|
228 |
+
self.dt_proj.weight,
|
229 |
+
A,
|
230 |
+
None, # input-dependent B
|
231 |
+
None, # input-dependent C
|
232 |
+
self.D.float(),
|
233 |
+
delta_bias=self.dt_proj.bias.float(),
|
234 |
+
delta_softplus=True,
|
235 |
+
)
|
236 |
+
out_b = mamba_inner_fn_no_out_proj(
|
237 |
+
xz.flip([-1]),
|
238 |
+
self.conv1d_b.weight,
|
239 |
+
self.conv1d_b.bias,
|
240 |
+
self.x_proj_b.weight,
|
241 |
+
self.dt_proj_b.weight,
|
242 |
+
A_b,
|
243 |
+
None,
|
244 |
+
None,
|
245 |
+
self.D_b.float(),
|
246 |
+
delta_bias=self.dt_proj_b.bias.float(),
|
247 |
+
delta_softplus=True,
|
248 |
+
)
|
249 |
+
|
250 |
+
if not self.if_devide_out:
|
251 |
+
out = F.linear(rearrange(out + out_b.flip([-1]), "b d l -> b l d"), self.out_proj.weight, self.out_proj.bias)
|
252 |
+
else:
|
253 |
+
out = F.linear(rearrange(0.5*out + 0.5*out_b.flip([-1]), "b d l -> b l d"), self.out_proj.weight, self.out_proj.bias)
|
254 |
+
|
255 |
+
else:
|
256 |
+
out = mamba_inner_fn(
|
257 |
+
xz,
|
258 |
+
self.conv1d.weight,
|
259 |
+
self.conv1d.bias,
|
260 |
+
self.x_proj.weight,
|
261 |
+
self.dt_proj.weight,
|
262 |
+
self.out_proj.weight,
|
263 |
+
self.out_proj.bias,
|
264 |
+
A,
|
265 |
+
None, # input-dependent B
|
266 |
+
None, # input-dependent C
|
267 |
+
self.D.float(),
|
268 |
+
delta_bias=self.dt_proj.bias.float(),
|
269 |
+
delta_softplus=True,
|
270 |
+
)
|
271 |
+
else:
|
272 |
+
x, z = xz.chunk(2, dim=1)
|
273 |
+
# Compute short convolution
|
274 |
+
if conv_state is not None:
|
275 |
+
# If we just take x[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
|
276 |
+
# Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
|
277 |
+
conv_state.copy_(F.pad(x, (self.d_conv - x.shape[-1], 0))) # Update state (B D W)
|
278 |
+
if causal_conv1d_fn is None:
|
279 |
+
x = self.act(self.conv1d(x)[..., :seqlen])
|
280 |
+
else:
|
281 |
+
assert self.activation in ["silu", "swish"]
|
282 |
+
x = causal_conv1d_fn(
|
283 |
+
x=x,
|
284 |
+
weight=rearrange(self.conv1d.weight, "d 1 w -> d w"),
|
285 |
+
bias=self.conv1d.bias,
|
286 |
+
activation=self.activation,
|
287 |
+
)
|
288 |
+
|
289 |
+
# We're careful here about the layout, to avoid extra transposes.
|
290 |
+
# We want dt to have d as the slowest moving dimension
|
291 |
+
# and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
|
292 |
+
x_dbl = self.x_proj(rearrange(x, "b d l -> (b l) d")) # (bl d)
|
293 |
+
dt, B, C = torch.split(x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1)
|
294 |
+
dt = self.dt_proj.weight @ dt.t()
|
295 |
+
dt = rearrange(dt, "d (b l) -> b d l", l=seqlen)
|
296 |
+
B = rearrange(B, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
|
297 |
+
C = rearrange(C, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
|
298 |
+
assert self.activation in ["silu", "swish"]
|
299 |
+
y = selective_scan_fn(
|
300 |
+
x,
|
301 |
+
dt,
|
302 |
+
A,
|
303 |
+
B,
|
304 |
+
C,
|
305 |
+
self.D.float(),
|
306 |
+
z=z,
|
307 |
+
delta_bias=self.dt_proj.bias.float(),
|
308 |
+
delta_softplus=True,
|
309 |
+
return_last_state=ssm_state is not None,
|
310 |
+
)
|
311 |
+
if ssm_state is not None:
|
312 |
+
y, last_state = y
|
313 |
+
ssm_state.copy_(last_state)
|
314 |
+
y = rearrange(y, "b d l -> b l d")
|
315 |
+
out = self.out_proj(y)
|
316 |
+
if self.init_layer_scale is not None:
|
317 |
+
out = out * self.gamma
|
318 |
+
return out
|
319 |
+
|
320 |
+
def step(self, hidden_states, conv_state, ssm_state):
|
321 |
+
dtype = hidden_states.dtype
|
322 |
+
assert hidden_states.shape[1] == 1, "Only support decoding with 1 token at a time for now"
|
323 |
+
xz = self.in_proj(hidden_states.squeeze(1)) # (B 2D)
|
324 |
+
x, z = xz.chunk(2, dim=-1) # (B D)
|
325 |
+
|
326 |
+
# Conv step
|
327 |
+
if causal_conv1d_update is None:
|
328 |
+
conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W)
|
329 |
+
conv_state[:, :, -1] = x
|
330 |
+
x = torch.sum(conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1) # (B D)
|
331 |
+
if self.conv1d.bias is not None:
|
332 |
+
x = x + self.conv1d.bias
|
333 |
+
x = self.act(x).to(dtype=dtype)
|
334 |
+
else:
|
335 |
+
x = causal_conv1d_update(
|
336 |
+
x,
|
337 |
+
conv_state,
|
338 |
+
rearrange(self.conv1d.weight, "d 1 w -> d w"),
|
339 |
+
self.conv1d.bias,
|
340 |
+
self.activation,
|
341 |
+
)
|
342 |
+
|
343 |
+
x_db = self.x_proj(x) # (B dt_rank+2*d_state)
|
344 |
+
dt, B, C = torch.split(x_db, [self.dt_rank, self.d_state, self.d_state], dim=-1)
|
345 |
+
# Don't add dt_bias here
|
346 |
+
dt = F.linear(dt, self.dt_proj.weight) # (B d_inner)
|
347 |
+
A = -torch.exp(self.A_log.float()) # (d_inner, d_state)
|
348 |
+
|
349 |
+
# SSM step
|
350 |
+
if selective_state_update is None:
|
351 |
+
# Discretize A and B
|
352 |
+
dt = F.softplus(dt + self.dt_proj.bias.to(dtype=dt.dtype))
|
353 |
+
dA = torch.exp(torch.einsum("bd,dn->bdn", dt, A))
|
354 |
+
dB = torch.einsum("bd,bn->bdn", dt, B)
|
355 |
+
ssm_state.copy_(ssm_state * dA + rearrange(x, "b d -> b d 1") * dB)
|
356 |
+
y = torch.einsum("bdn,bn->bd", ssm_state.to(dtype), C)
|
357 |
+
y = y + self.D.to(dtype) * x
|
358 |
+
y = y * self.act(z) # (B D)
|
359 |
+
else:
|
360 |
+
y = selective_state_update(
|
361 |
+
ssm_state, x, dt, A, B, C, self.D, z=z, dt_bias=self.dt_proj.bias, dt_softplus=True
|
362 |
+
)
|
363 |
+
|
364 |
+
out = self.out_proj(y)
|
365 |
+
return out.unsqueeze(1), conv_state, ssm_state
|
366 |
+
|
367 |
+
def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
|
368 |
+
device = self.out_proj.weight.device
|
369 |
+
conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
|
370 |
+
conv_state = torch.zeros(
|
371 |
+
batch_size, self.d_model * self.expand, self.d_conv, device=device, dtype=conv_dtype
|
372 |
+
)
|
373 |
+
ssm_dtype = self.dt_proj.weight.dtype if dtype is None else dtype
|
374 |
+
# ssm_dtype = torch.float32
|
375 |
+
ssm_state = torch.zeros(
|
376 |
+
batch_size, self.d_model * self.expand, self.d_state, device=device, dtype=ssm_dtype
|
377 |
+
)
|
378 |
+
return conv_state, ssm_state
|
379 |
+
|
380 |
+
def _get_states_from_cache(self, inference_params, batch_size, initialize_states=False):
|
381 |
+
assert self.layer_idx is not None
|
382 |
+
if self.layer_idx not in inference_params.key_value_memory_dict:
|
383 |
+
batch_shape = (batch_size,)
|
384 |
+
conv_state = torch.zeros(
|
385 |
+
batch_size,
|
386 |
+
self.d_model * self.expand,
|
387 |
+
self.d_conv,
|
388 |
+
device=self.conv1d.weight.device,
|
389 |
+
dtype=self.conv1d.weight.dtype,
|
390 |
+
)
|
391 |
+
ssm_state = torch.zeros(
|
392 |
+
batch_size,
|
393 |
+
self.d_model * self.expand,
|
394 |
+
self.d_state,
|
395 |
+
device=self.dt_proj.weight.device,
|
396 |
+
dtype=self.dt_proj.weight.dtype,
|
397 |
+
# dtype=torch.float32,
|
398 |
+
)
|
399 |
+
inference_params.key_value_memory_dict[self.layer_idx] = (conv_state, ssm_state)
|
400 |
+
else:
|
401 |
+
conv_state, ssm_state = inference_params.key_value_memory_dict[self.layer_idx]
|
402 |
+
# TODO: What if batch size changes between generation, and we reuse the same states?
|
403 |
+
if initialize_states:
|
404 |
+
conv_state.zero_()
|
405 |
+
ssm_state.zero_()
|
406 |
+
return conv_state, ssm_state
|
407 |
+
|
408 |
+
|
409 |
+
class Block(nn.Module):
|
410 |
+
def __init__(
|
411 |
+
self, dim, mixer_cls, norm_cls=nn.LayerNorm, fused_add_norm=False, residual_in_fp32=False
|
412 |
+
):
|
413 |
+
"""
|
414 |
+
Simple block wrapping a mixer class with LayerNorm/RMSNorm and residual connection"
|
415 |
+
|
416 |
+
This Block has a slightly different structure compared to a regular
|
417 |
+
prenorm Transformer block.
|
418 |
+
The standard block is: LN -> MHA/MLP -> Add.
|
419 |
+
[Ref: https://arxiv.org/abs/2002.04745]
|
420 |
+
Here we have: Add -> LN -> Mixer, returning both
|
421 |
+
the hidden_states (output of the mixer) and the residual.
|
422 |
+
This is purely for performance reasons, as we can fuse add and LayerNorm.
|
423 |
+
The residual needs to be provided (except for the very first block).
|
424 |
+
"""
|
425 |
+
super().__init__()
|
426 |
+
self.residual_in_fp32 = residual_in_fp32
|
427 |
+
self.fused_add_norm = fused_add_norm
|
428 |
+
self.mixer = mixer_cls(dim)
|
429 |
+
self.norm = norm_cls(dim)
|
430 |
+
if self.fused_add_norm:
|
431 |
+
assert RMSNorm is not None, "RMSNorm import fails"
|
432 |
+
assert isinstance(
|
433 |
+
self.norm, (nn.LayerNorm, RMSNorm)
|
434 |
+
), "Only LayerNorm and RMSNorm are supported for fused_add_norm"
|
435 |
+
|
436 |
+
def forward(
|
437 |
+
self, hidden_states: Tensor, residual: Optional[Tensor] = None, inference_params=None
|
438 |
+
):
|
439 |
+
r"""Pass the input through the encoder layer.
|
440 |
+
|
441 |
+
Args:
|
442 |
+
hidden_states: the sequence to the encoder layer (required).
|
443 |
+
residual: hidden_states = Mixer(LN(residual))
|
444 |
+
"""
|
445 |
+
if not self.fused_add_norm:
|
446 |
+
residual = (hidden_states + residual) if residual is not None else hidden_states
|
447 |
+
hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype))
|
448 |
+
if self.residual_in_fp32:
|
449 |
+
residual = residual.to(torch.float32)
|
450 |
+
else:
|
451 |
+
fused_add_norm_fn = rms_norm_fn if isinstance(self.norm, RMSNorm) else layer_norm_fn
|
452 |
+
hidden_states, residual = fused_add_norm_fn(
|
453 |
+
hidden_states,
|
454 |
+
self.norm.weight,
|
455 |
+
self.norm.bias,
|
456 |
+
residual=residual,
|
457 |
+
prenorm=True,
|
458 |
+
residual_in_fp32=self.residual_in_fp32,
|
459 |
+
eps=self.norm.eps,
|
460 |
+
)
|
461 |
+
hidden_states = self.mixer(hidden_states, inference_params=inference_params)
|
462 |
+
return hidden_states, residual
|
463 |
+
|
464 |
+
def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
|
465 |
+
return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
|
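A small usage sketch for the bidirectional Mamba class above. It assumes the CUDA mamba_ssm / causal_conv1d kernels are installed and the repository root is on PYTHONPATH; bimamba_type="v2" is the only mode this copy accepts (see the assert in __init__):

import torch
from modules.mamba.bimamba import Mamba as BiMamba

layer = BiMamba(d_model=256, d_state=16, d_conv=4, expand=2, bimamba_type="v2").cuda()

x = torch.randn(8, 100, 256, device="cuda")   # (batch, length, d_model)
y = layer(x)                                  # same shape: (8, 100, 256)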
model/modules/mamba/mamba_blocks.py
ADDED
@@ -0,0 +1,252 @@
1 |
+
'''
|
2 |
+
Copied and modified from
|
3 |
+
https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py
|
4 |
+
'''
|
5 |
+
|
6 |
+
import math
|
7 |
+
import torch
|
8 |
+
import torch.nn as nn
|
9 |
+
|
10 |
+
from functools import partial
|
11 |
+
|
12 |
+
from mamba_ssm import Mamba
|
13 |
+
from modules.mamba.bimamba import Mamba as BiMamba
|
14 |
+
from modules.mamba.bimamba import Block as PreNormBlock
|
15 |
+
|
16 |
+
try:
|
17 |
+
from mamba_ssm.ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn
|
18 |
+
except ImportError:
|
19 |
+
RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
|
20 |
+
|
21 |
+
|
22 |
+
def create_block(
|
23 |
+
d_model,
|
24 |
+
ssm_cls=None,
|
25 |
+
ssm_cfg=None,
|
26 |
+
norm_epsilon=1e-5,
|
27 |
+
rms_norm=False,
|
28 |
+
residual_in_fp32=False,
|
29 |
+
fused_add_norm=True,
|
30 |
+
layer_idx=None,
|
31 |
+
device=None,
|
32 |
+
dtype=None,
|
33 |
+
):
|
34 |
+
if ssm_cfg is None:
|
35 |
+
ssm_cfg = {}
|
36 |
+
factory_kwargs = {"device": device, "dtype": dtype}
|
37 |
+
mixer_cls = partial(ssm_cls, layer_idx=layer_idx, **ssm_cfg, **factory_kwargs)
|
38 |
+
norm_cls = partial(
|
39 |
+
nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
|
40 |
+
)
|
41 |
+
block = PreNormBlock(
|
42 |
+
d_model,
|
43 |
+
mixer_cls,
|
44 |
+
norm_cls=norm_cls,
|
45 |
+
fused_add_norm=fused_add_norm,
|
46 |
+
residual_in_fp32=residual_in_fp32,
|
47 |
+
)
|
48 |
+
block.layer_idx = layer_idx
|
49 |
+
return block
|
50 |
+
|
51 |
+
|
52 |
+
# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
|
53 |
+
def _init_weights(
|
54 |
+
module,
|
55 |
+
n_layer,
|
56 |
+
initializer_range=0.02, # Now only used for embedding layer.
|
57 |
+
rescale_prenorm_residual=True,
|
58 |
+
n_residuals_per_layer=1, # Change to 2 if we have MLP
|
59 |
+
):
|
60 |
+
if isinstance(module, nn.Linear):
|
61 |
+
if module.bias is not None:
|
62 |
+
if not getattr(module.bias, "_no_reinit", False):
|
63 |
+
nn.init.zeros_(module.bias)
|
64 |
+
elif isinstance(module, nn.Embedding):
|
65 |
+
nn.init.normal_(module.weight, std=initializer_range)
|
66 |
+
|
67 |
+
if rescale_prenorm_residual:
|
68 |
+
# Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
|
69 |
+
# > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
|
70 |
+
# > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
|
71 |
+
# > -- GPT-2 :: https://openai.com/blog/better-language-models/
|
72 |
+
#
|
73 |
+
# Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
|
74 |
+
for name, p in module.named_parameters():
|
75 |
+
if name in ["out_proj.weight", "fc2.weight"]:
|
76 |
+
# Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
|
77 |
+
# Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
|
78 |
+
# We need to reinit p since this code could be called multiple times
|
79 |
+
# Having just p *= scale would repeatedly scale it down
|
80 |
+
nn.init.kaiming_uniform_(p, a=math.sqrt(5))
|
81 |
+
with torch.no_grad():
|
82 |
+
p /= math.sqrt(n_residuals_per_layer * n_layer)
|
83 |
+
|
84 |
+
|
85 |
+
class LnMambaAdd(nn.Module):
|
86 |
+
|
87 |
+
def __init__(self,
|
88 |
+
d_model,
|
89 |
+
ssm_cls,
|
90 |
+
ssm_cfg,
|
91 |
+
rms_norm=False,
|
92 |
+
layer_idx=None
|
93 |
+
):
|
94 |
+
super().__init__()
|
95 |
+
if rms_norm:
|
96 |
+
self.norm = RMSNorm(d_model)
|
97 |
+
else:
|
98 |
+
self.norm = nn.LayerNorm(d_model)
|
99 |
+
self.mamba = ssm_cls(d_model=d_model, **ssm_cfg)
|
100 |
+
|
101 |
+
print(type(self.mamba))
|
102 |
+
|
103 |
+
print('Created LnMambaAdd.')
|
104 |
+
|
105 |
+
def forward(self, x, residual=None, inference_params=None):
|
106 |
+
if residual is not None:
|
107 |
+
x = x + residual
|
108 |
+
return self.mamba(self.norm(x)), x
|
109 |
+
|
110 |
+
|
111 |
+
class MambaBlocksSequential(nn.Module):
|
112 |
+
"""
|
113 |
+
A wrapper for the Mamba block to replicate it
|
114 |
+
|
115 |
+
Arguments
|
116 |
+
---------
|
117 |
+
n_mamba : int
|
118 |
+
Number of Mamba blocks
|
119 |
+
d_model : int
|
120 |
+
Input dimension to Mamba (bottleneck dimension).
|
121 |
+
d_state : int
|
122 |
+
Mamba state dimension
|
123 |
+
expand: int
|
124 |
+
First linear projection d_model -> d_model * expand
|
125 |
+
d_conv: int
|
126 |
+
Kernel size of Mamba conv
|
127 |
+
norm type : str
|
128 |
+
The type of normalization, in ['gLN', 'cLN'].
|
129 |
+
---------
|
130 |
+
"""
|
131 |
+
|
132 |
+
def __init__(self,
|
133 |
+
n_mamba: int,
|
134 |
+
bidirectional: bool,
|
135 |
+
d_model: int, # bottleneck dimension (B)
|
136 |
+
d_state: int = 16,
|
137 |
+
expand: int = 2,
|
138 |
+
d_conv: int = 4, # kernel_size of 'Conv' in Mamba
|
139 |
+
dt_rank: str="auto",
|
140 |
+
conv_bias: bool = True,
|
141 |
+
bias: bool = False,
|
142 |
+
fused_add_norm: bool = True,
|
143 |
+
rms_norm: bool = False,
|
144 |
+
norm_epsilon: float = 1e-5,
|
145 |
+
initializer_cfg=None,
|
146 |
+
residual_in_fp32=False,
|
147 |
+
use_simple_block=False
|
148 |
+
):
|
149 |
+
super().__init__()
|
150 |
+
self.residual_in_fp32 = residual_in_fp32
|
151 |
+
self.bidirectional = bidirectional
|
152 |
+
|
153 |
+
# We change the order of residual and layer norm:
|
154 |
+
# Instead of LN -> Attn / MLP -> Add, we do:
|
155 |
+
# Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
|
156 |
+
# the main branch (output of MLP / Mixer). The model definition is unchanged.
|
157 |
+
# This is for performance reason: we can fuse add + layer_norm.
|
158 |
+
self.fused_add_norm = fused_add_norm
|
159 |
+
if self.fused_add_norm:
|
160 |
+
if layer_norm_fn is None or rms_norm_fn is None:
|
161 |
+
raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")
|
162 |
+
|
163 |
+
self.use_simple_block = use_simple_block
|
164 |
+
|
165 |
+
ssm_cfg = {
|
166 |
+
"d_state": d_state,
|
167 |
+
"expand": expand,
|
168 |
+
"d_conv": d_conv,
|
169 |
+
"dt_rank": dt_rank,
|
170 |
+
"conv_bias": conv_bias,
|
171 |
+
"bias": bias
|
172 |
+
}
|
173 |
+
if bidirectional:
|
174 |
+
ssm_cfg["bimamba_type"] = "v2"
|
175 |
+
|
176 |
+
if use_simple_block:
|
177 |
+
self.layers = nn.Sequential(
|
178 |
+
*[
|
179 |
+
LnMambaAdd(
|
180 |
+
d_model=d_model,
|
181 |
+
ssm_cls=BiMamba if bidirectional else Mamba,
|
182 |
+
ssm_cfg=ssm_cfg,
|
183 |
+
rms_norm=rms_norm,
|
184 |
+
layer_idx=i
|
185 |
+
)
|
186 |
+
for i in range(n_mamba)
|
187 |
+
]
|
188 |
+
)
|
189 |
+
else:
|
190 |
+
self.layers = nn.Sequential(
|
191 |
+
*[
|
192 |
+
create_block(
|
193 |
+
d_model=d_model,
|
194 |
+
ssm_cls=BiMamba if bidirectional else Mamba,
|
195 |
+
ssm_cfg=ssm_cfg,
|
196 |
+
norm_epsilon=norm_epsilon,
|
197 |
+
rms_norm=rms_norm,
|
198 |
+
residual_in_fp32=residual_in_fp32,
|
199 |
+
fused_add_norm=fused_add_norm,
|
200 |
+
layer_idx=i,
|
201 |
+
)
|
202 |
+
for i in range(n_mamba)
|
203 |
+
]
|
204 |
+
)
|
205 |
+
|
206 |
+
self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)(
|
207 |
+
d_model, eps=norm_epsilon
|
208 |
+
)
|
209 |
+
|
210 |
+
self.apply(
|
211 |
+
partial(
|
212 |
+
_init_weights,
|
213 |
+
n_layer=n_mamba,
|
214 |
+
**(initializer_cfg if initializer_cfg is not None else {}),
|
215 |
+
)
|
216 |
+
)
|
217 |
+
|
218 |
+
|
219 |
+
def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
|
220 |
+
return {
|
221 |
+
i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
|
222 |
+
for i, layer in enumerate(self.layers)
|
223 |
+
}
|
224 |
+
|
225 |
+
def forward(self, x, inference_params=None):
|
226 |
+
|
227 |
+
hidden_states = x
|
228 |
+
residual = None
|
229 |
+
for i, layer in enumerate(self.layers):
|
230 |
+
hidden_states, residual = layer(
|
231 |
+
hidden_states, residual, inference_params=inference_params
|
232 |
+
)
|
233 |
+
|
234 |
+
if not self.fused_add_norm:
|
235 |
+
residual = (hidden_states + residual) if residual is not None else hidden_states
|
236 |
+
hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
|
237 |
+
else:
|
238 |
+
# Set prenorm=False here since we don't need the residual
|
239 |
+
fused_add_norm_fn = rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn
|
240 |
+
|
241 |
+
hidden_states = fused_add_norm_fn(
|
242 |
+
hidden_states,
|
243 |
+
self.norm_f.weight,
|
244 |
+
self.norm_f.bias,
|
245 |
+
eps=self.norm_f.eps,
|
246 |
+
residual=residual,
|
247 |
+
prenorm=False,
|
248 |
+
residual_in_fp32=self.residual_in_fp32,
|
249 |
+
)
|
250 |
+
|
251 |
+
return hidden_states
|
252 |
+
|
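The `_init_weights` comments at the top of this file section describe a GPT-2 style residual rescaling: selected output projections are re-initialized with Kaiming uniform and then divided by sqrt(n_residuals_per_layer * n_layer), so the variance of the residual stream does not grow with depth. Below is a minimal, self-contained PyTorch sketch of that rescaling on a toy block stack; the `ToyBlock` module and the layer count are illustrative placeholders, not part of the uploaded code.

import math
import torch
import torch.nn as nn

class ToyBlock(nn.Module):
    # Hypothetical stand-in for a residual block; only the parameter names matter here.
    def __init__(self, d_model):
        super().__init__()
        self.fc1 = nn.Linear(d_model, 4 * d_model)
        self.fc2 = nn.Linear(4 * d_model, d_model)  # projection feeding the residual stream

n_layer, n_residuals_per_layer = 8, 1
blocks = nn.ModuleList([ToyBlock(64) for _ in range(n_layer)])

for name, p in blocks.named_parameters():
    if name.endswith("fc2.weight"):
        # Re-initialize, then scale down by 1/sqrt(n_residuals_per_layer * n_layer),
        # mirroring the comments in _init_weights above.
        nn.init.kaiming_uniform_(p, a=math.sqrt(5))
        with torch.no_grad():
            p /= math.sqrt(n_residuals_per_layer * n_layer)

print(blocks[0].fc2.weight.std().item())  # roughly 1/sqrt(8) of the default Linear init std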
model/modules/mamba/selective_scan_interface.py
ADDED
@@ -0,0 +1,714 @@
1 |
+
'''
|
2 |
+
Copied from
|
3 |
+
https://github.com/hustvl/Vim/blob/main/mamba-1p1p1/mamba_ssm/ops/selective_scan_interface.py
|
4 |
+
'''
|
5 |
+
|
6 |
+
# Copyright (c) 2023, Tri Dao, Albert Gu.
|
7 |
+
|
8 |
+
import torch
|
9 |
+
import torch.nn.functional as F
|
10 |
+
from torch.cuda.amp import custom_bwd, custom_fwd
|
11 |
+
|
12 |
+
from einops import rearrange, repeat
|
13 |
+
|
14 |
+
from causal_conv1d import causal_conv1d_fn
|
15 |
+
import causal_conv1d_cuda
|
16 |
+
import selective_scan_cuda
|
17 |
+
|
18 |
+
|
19 |
+
class SelectiveScanFn(torch.autograd.Function):
|
20 |
+
|
21 |
+
@staticmethod
|
22 |
+
def forward(ctx, u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False,
|
23 |
+
return_last_state=False):
|
24 |
+
if u.stride(-1) != 1:
|
25 |
+
u = u.contiguous()
|
26 |
+
if delta.stride(-1) != 1:
|
27 |
+
delta = delta.contiguous()
|
28 |
+
if D is not None:
|
29 |
+
D = D.contiguous()
|
30 |
+
if B.stride(-1) != 1:
|
31 |
+
B = B.contiguous()
|
32 |
+
if C.stride(-1) != 1:
|
33 |
+
C = C.contiguous()
|
34 |
+
if z is not None and z.stride(-1) != 1:
|
35 |
+
z = z.contiguous()
|
36 |
+
if B.dim() == 3:
|
37 |
+
B = rearrange(B, "b dstate l -> b 1 dstate l")
|
38 |
+
ctx.squeeze_B = True
|
39 |
+
if C.dim() == 3:
|
40 |
+
C = rearrange(C, "b dstate l -> b 1 dstate l")
|
41 |
+
ctx.squeeze_C = True
|
42 |
+
out, x, *rest = selective_scan_cuda.fwd(u, delta, A, B, C, D, z, delta_bias, delta_softplus)
|
43 |
+
ctx.delta_softplus = delta_softplus
|
44 |
+
ctx.has_z = z is not None
|
45 |
+
last_state = x[:, :, -1, 1::2] # (batch, dim, dstate)
|
46 |
+
if not ctx.has_z:
|
47 |
+
ctx.save_for_backward(u, delta, A, B, C, D, delta_bias, x)
|
48 |
+
return out if not return_last_state else (out, last_state)
|
49 |
+
else:
|
50 |
+
ctx.save_for_backward(u, delta, A, B, C, D, z, delta_bias, x, out)
|
51 |
+
out_z = rest[0]
|
52 |
+
return out_z if not return_last_state else (out_z, last_state)
|
53 |
+
|
54 |
+
@staticmethod
|
55 |
+
def backward(ctx, dout, *args):
|
56 |
+
if not ctx.has_z:
|
57 |
+
u, delta, A, B, C, D, delta_bias, x = ctx.saved_tensors
|
58 |
+
z = None
|
59 |
+
out = None
|
60 |
+
else:
|
61 |
+
u, delta, A, B, C, D, z, delta_bias, x, out = ctx.saved_tensors
|
62 |
+
if dout.stride(-1) != 1:
|
63 |
+
dout = dout.contiguous()
|
64 |
+
# The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
|
65 |
+
# backward of selective_scan_cuda with the backward of chunk).
|
66 |
+
# Here we just pass in None and dz will be allocated in the C++ code.
|
67 |
+
du, ddelta, dA, dB, dC, dD, ddelta_bias, *rest = selective_scan_cuda.bwd(
|
68 |
+
u, delta, A, B, C, D, z, delta_bias, dout, x, out, None, ctx.delta_softplus,
|
69 |
+
False # option to recompute out_z, not used here
|
70 |
+
)
|
71 |
+
dz = rest[0] if ctx.has_z else None
|
72 |
+
dB = dB.squeeze(1) if getattr(ctx, "squeeze_B", False) else dB
|
73 |
+
dC = dC.squeeze(1) if getattr(ctx, "squeeze_C", False) else dC
|
74 |
+
return (du, ddelta, dA, dB, dC,
|
75 |
+
dD if D is not None else None,
|
76 |
+
dz,
|
77 |
+
ddelta_bias if delta_bias is not None else None,
|
78 |
+
None,
|
79 |
+
None)
|
80 |
+
|
81 |
+
|
82 |
+
def selective_scan_fn(u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False,
|
83 |
+
return_last_state=False):
|
84 |
+
"""if return_last_state is True, returns (out, last_state)
|
85 |
+
last_state has shape (batch, dim, dstate). Note that the gradient of the last state is
|
86 |
+
not considered in the backward pass.
|
87 |
+
"""
|
88 |
+
return SelectiveScanFn.apply(u, delta, A, B, C, D, z, delta_bias, delta_softplus, return_last_state)
|
89 |
+
|
90 |
+
|
91 |
+
def selective_scan_ref(u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False,
|
92 |
+
return_last_state=False):
|
93 |
+
"""
|
94 |
+
u: r(B D L)
|
95 |
+
delta: r(B D L)
|
96 |
+
A: c(D N) or r(D N)
|
97 |
+
B: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
|
98 |
+
C: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
|
99 |
+
D: r(D)
|
100 |
+
z: r(B D L)
|
101 |
+
delta_bias: r(D), fp32
|
102 |
+
|
103 |
+
out: r(B D L)
|
104 |
+
last_state (optional): r(B D dstate) or c(B D dstate)
|
105 |
+
"""
|
106 |
+
dtype_in = u.dtype
|
107 |
+
u = u.float()
|
108 |
+
delta = delta.float()
|
109 |
+
if delta_bias is not None:
|
110 |
+
delta = delta + delta_bias[..., None].float()
|
111 |
+
if delta_softplus:
|
112 |
+
delta = F.softplus(delta)
|
113 |
+
batch, dim, dstate = u.shape[0], A.shape[0], A.shape[1]
|
114 |
+
is_variable_B = B.dim() >= 3
|
115 |
+
is_variable_C = C.dim() >= 3
|
116 |
+
if A.is_complex():
|
117 |
+
if is_variable_B:
|
118 |
+
B = torch.view_as_complex(rearrange(B.float(), "... (L two) -> ... L two", two=2))
|
119 |
+
if is_variable_C:
|
120 |
+
C = torch.view_as_complex(rearrange(C.float(), "... (L two) -> ... L two", two=2))
|
121 |
+
else:
|
122 |
+
B = B.float()
|
123 |
+
C = C.float()
|
124 |
+
x = A.new_zeros((batch, dim, dstate))
|
125 |
+
ys = []
|
126 |
+
deltaA = torch.exp(torch.einsum('bdl,dn->bdln', delta, A))
|
127 |
+
if not is_variable_B:
|
128 |
+
deltaB_u = torch.einsum('bdl,dn,bdl->bdln', delta, B, u)
|
129 |
+
else:
|
130 |
+
if B.dim() == 3:
|
131 |
+
deltaB_u = torch.einsum('bdl,bnl,bdl->bdln', delta, B, u)
|
132 |
+
else:
|
133 |
+
B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1])
|
134 |
+
deltaB_u = torch.einsum('bdl,bdnl,bdl->bdln', delta, B, u)
|
135 |
+
if is_variable_C and C.dim() == 4:
|
136 |
+
C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1])
|
137 |
+
last_state = None
|
138 |
+
for i in range(u.shape[2]):
|
139 |
+
x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
|
140 |
+
if not is_variable_C:
|
141 |
+
y = torch.einsum('bdn,dn->bd', x, C)
|
142 |
+
else:
|
143 |
+
if C.dim() == 3:
|
144 |
+
y = torch.einsum('bdn,bn->bd', x, C[:, :, i])
|
145 |
+
else:
|
146 |
+
y = torch.einsum('bdn,bdn->bd', x, C[:, :, :, i])
|
147 |
+
if i == u.shape[2] - 1:
|
148 |
+
last_state = x
|
149 |
+
if y.is_complex():
|
150 |
+
y = y.real * 2
|
151 |
+
ys.append(y)
|
152 |
+
y = torch.stack(ys, dim=2) # (batch dim L)
|
153 |
+
out = y if D is None else y + u * rearrange(D, "d -> d 1")
|
154 |
+
if z is not None:
|
155 |
+
out = out * F.silu(z)
|
156 |
+
out = out.to(dtype=dtype_in)
|
157 |
+
return out if not return_last_state else (out, last_state)
|
158 |
+
|
159 |
+
|
160 |
+
class MambaInnerFnNoOutProj(torch.autograd.Function):
|
161 |
+
|
162 |
+
@staticmethod
|
163 |
+
@custom_fwd
|
164 |
+
def forward(ctx, xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
|
165 |
+
A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None,
|
166 |
+
C_proj_bias=None, delta_softplus=True, checkpoint_lvl=1):
|
167 |
+
"""
|
168 |
+
xz: (batch, dim, seqlen)
|
169 |
+
"""
|
170 |
+
assert checkpoint_lvl in [0, 1]
|
171 |
+
L = xz.shape[-1]
|
172 |
+
delta_rank = delta_proj_weight.shape[1]
|
173 |
+
d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
|
174 |
+
if torch.is_autocast_enabled():
|
175 |
+
x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
|
176 |
+
delta_proj_weight = delta_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
|
177 |
+
if xz.stride(-1) != 1:
|
178 |
+
xz = xz.contiguous()
|
179 |
+
conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w")
|
180 |
+
x, z = xz.chunk(2, dim=1)
|
181 |
+
conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None
|
182 |
+
conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, None, True)
|
183 |
+
# We're being very careful here about the layout, to avoid extra transposes.
|
184 |
+
# We want delta to have d as the slowest moving dimension
|
185 |
+
# and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
|
186 |
+
x_dbl = F.linear(rearrange(conv1d_out, 'b d l -> (b l) d'), x_proj_weight) # (bl d)
|
187 |
+
delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l = L)
|
188 |
+
ctx.is_variable_B = B is None
|
189 |
+
ctx.is_variable_C = C is None
|
190 |
+
ctx.B_proj_bias_is_None = B_proj_bias is None
|
191 |
+
ctx.C_proj_bias_is_None = C_proj_bias is None
|
192 |
+
if B is None: # variable B
|
193 |
+
B = x_dbl[:, delta_rank:delta_rank + d_state] # (bl dstate)
|
194 |
+
if B_proj_bias is not None:
|
195 |
+
B = B + B_proj_bias.to(dtype=B.dtype)
|
196 |
+
if not A.is_complex():
|
197 |
+
# B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
|
198 |
+
B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
|
199 |
+
else:
|
200 |
+
B = rearrange(B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous()
|
201 |
+
else:
|
202 |
+
if B.stride(-1) != 1:
|
203 |
+
B = B.contiguous()
|
204 |
+
if C is None: # variable C
|
205 |
+
C = x_dbl[:, -d_state:] # (bl dstate)
|
206 |
+
if C_proj_bias is not None:
|
207 |
+
C = C + C_proj_bias.to(dtype=C.dtype)
|
208 |
+
if not A.is_complex():
|
209 |
+
# C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
|
210 |
+
C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
|
211 |
+
else:
|
212 |
+
C = rearrange(C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous()
|
213 |
+
else:
|
214 |
+
if C.stride(-1) != 1:
|
215 |
+
C = C.contiguous()
|
216 |
+
if D is not None:
|
217 |
+
D = D.contiguous()
|
218 |
+
out, scan_intermediates, out_z = selective_scan_cuda.fwd(
|
219 |
+
conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus
|
220 |
+
)
|
221 |
+
ctx.delta_softplus = delta_softplus
|
222 |
+
ctx.checkpoint_lvl = checkpoint_lvl
|
223 |
+
if checkpoint_lvl >= 1: # Will recompute conv1d_out and delta in the backward pass
|
224 |
+
conv1d_out, delta = None, None
|
225 |
+
ctx.save_for_backward(xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight,
|
226 |
+
delta_proj_weight, conv1d_out, delta,
|
227 |
+
A, B, C, D, delta_bias, scan_intermediates, out)
|
228 |
+
# return rearrange(out_z, "b d l -> b l d")
|
229 |
+
return out_z
|
230 |
+
|
231 |
+
@staticmethod
|
232 |
+
@custom_bwd
|
233 |
+
def backward(ctx, dout):
|
234 |
+
# dout: (batch, seqlen, dim)
|
235 |
+
(xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight, delta_proj_weight,
|
236 |
+
conv1d_out, delta, A, B, C, D, delta_bias, scan_intermediates, out) = ctx.saved_tensors
|
237 |
+
L = xz.shape[-1]
|
238 |
+
delta_rank = delta_proj_weight.shape[1]
|
239 |
+
d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
|
240 |
+
x, z = xz.chunk(2, dim=1)
|
241 |
+
if dout.stride(-1) != 1:
|
242 |
+
dout = dout.contiguous()
|
243 |
+
if ctx.checkpoint_lvl == 1:
|
244 |
+
conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, None, True)
|
245 |
+
delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(),
|
246 |
+
"d (b l) -> b d l", l = L)
|
247 |
+
# The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
|
248 |
+
# backward of selective_scan_cuda with the backward of chunk).
|
249 |
+
dxz = torch.empty_like(xz) # (batch, dim, seqlen)
|
250 |
+
dx, dz = dxz.chunk(2, dim=1)
|
251 |
+
# dout_y = rearrange(dout, "b l d -> b d l") # because no arrange at end of forward, so dout shape is b d l
|
252 |
+
dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z = selective_scan_cuda.bwd(
|
253 |
+
conv1d_out, delta, A, B, C, D, z, delta_bias, dout, scan_intermediates, out, dz,
|
254 |
+
ctx.delta_softplus,
|
255 |
+
True # option to recompute out_z
|
256 |
+
)
|
257 |
+
dD = dD if D is not None else None
|
258 |
+
dx_dbl = torch.empty_like(x_dbl)
|
259 |
+
dB_proj_bias = None
|
260 |
+
if ctx.is_variable_B:
|
261 |
+
if not A.is_complex():
|
262 |
+
dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous()
|
263 |
+
else:
|
264 |
+
dB = rearrange(dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous()
|
265 |
+
dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None
|
266 |
+
dx_dbl[:, delta_rank:delta_rank + d_state] = dB # (bl d)
|
267 |
+
dB = None
|
268 |
+
dC_proj_bias = None
|
269 |
+
if ctx.is_variable_C:
|
270 |
+
if not A.is_complex():
|
271 |
+
dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous()
|
272 |
+
else:
|
273 |
+
dC = rearrange(dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous()
|
274 |
+
dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None
|
275 |
+
dx_dbl[:, -d_state:] = dC # (bl d)
|
276 |
+
dC = None
|
277 |
+
ddelta = rearrange(ddelta, "b d l -> d (b l)")
|
278 |
+
ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank])
|
279 |
+
dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight)
|
280 |
+
dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)")
|
281 |
+
dx_proj_weight = torch.einsum("Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d"))
|
282 |
+
dconv1d_out = torch.addmm(dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out)
|
283 |
+
dconv1d_out = rearrange(dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1])
|
284 |
+
# The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the
|
285 |
+
# backward of conv1d with the backward of chunk).
|
286 |
+
dx, dconv1d_weight, dconv1d_bias = causal_conv1d_cuda.causal_conv1d_bwd(
|
287 |
+
x, conv1d_weight, conv1d_bias, dconv1d_out, None, dx, True
|
288 |
+
)
|
289 |
+
dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None
|
290 |
+
dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w")
|
291 |
+
return (dxz, dconv1d_weight, dconv1d_bias, dx_proj_weight, ddelta_proj_weight,
|
292 |
+
dA, dB, dC, dD,
|
293 |
+
ddelta_bias if delta_bias is not None else None,
|
294 |
+
dB_proj_bias, dC_proj_bias, None)
|
295 |
+
|
296 |
+
|
297 |
+
class MambaInnerFn(torch.autograd.Function):
|
298 |
+
|
299 |
+
@staticmethod
|
300 |
+
@custom_fwd
|
301 |
+
def forward(ctx, xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
|
302 |
+
out_proj_weight, out_proj_bias,
|
303 |
+
A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None,
|
304 |
+
C_proj_bias=None, delta_softplus=True, checkpoint_lvl=1):
|
305 |
+
"""
|
306 |
+
xz: (batch, dim, seqlen)
|
307 |
+
"""
|
308 |
+
assert checkpoint_lvl in [0, 1]
|
309 |
+
L = xz.shape[-1]
|
310 |
+
delta_rank = delta_proj_weight.shape[1]
|
311 |
+
d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
|
312 |
+
if torch.is_autocast_enabled():
|
313 |
+
x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
|
314 |
+
delta_proj_weight = delta_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
|
315 |
+
out_proj_weight = out_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
|
316 |
+
out_proj_bias = (out_proj_bias.to(dtype=torch.get_autocast_gpu_dtype())
|
317 |
+
if out_proj_bias is not None else None)
|
318 |
+
if xz.stride(-1) != 1:
|
319 |
+
xz = xz.contiguous()
|
320 |
+
conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w")
|
321 |
+
x, z = xz.chunk(2, dim=1)
|
322 |
+
conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None
|
323 |
+
conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, None, True)
|
324 |
+
# We're being very careful here about the layout, to avoid extra transposes.
|
325 |
+
# We want delta to have d as the slowest moving dimension
|
326 |
+
# and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
|
327 |
+
x_dbl = F.linear(rearrange(conv1d_out, 'b d l -> (b l) d'), x_proj_weight) # (bl d)
|
328 |
+
delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l = L)
|
329 |
+
ctx.is_variable_B = B is None
|
330 |
+
ctx.is_variable_C = C is None
|
331 |
+
ctx.B_proj_bias_is_None = B_proj_bias is None
|
332 |
+
ctx.C_proj_bias_is_None = C_proj_bias is None
|
333 |
+
if B is None: # variable B
|
334 |
+
B = x_dbl[:, delta_rank:delta_rank + d_state] # (bl dstate)
|
335 |
+
if B_proj_bias is not None:
|
336 |
+
B = B + B_proj_bias.to(dtype=B.dtype)
|
337 |
+
if not A.is_complex():
|
338 |
+
# B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
|
339 |
+
B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
|
340 |
+
else:
|
341 |
+
B = rearrange(B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous()
|
342 |
+
else:
|
343 |
+
if B.stride(-1) != 1:
|
344 |
+
B = B.contiguous()
|
345 |
+
if C is None: # variable C
|
346 |
+
C = x_dbl[:, -d_state:] # (bl dstate)
|
347 |
+
if C_proj_bias is not None:
|
348 |
+
C = C + C_proj_bias.to(dtype=C.dtype)
|
349 |
+
if not A.is_complex():
|
350 |
+
# C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
|
351 |
+
C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
|
352 |
+
else:
|
353 |
+
C = rearrange(C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous()
|
354 |
+
else:
|
355 |
+
if C.stride(-1) != 1:
|
356 |
+
C = C.contiguous()
|
357 |
+
if D is not None:
|
358 |
+
D = D.contiguous()
|
359 |
+
out, scan_intermediates, out_z = selective_scan_cuda.fwd(
|
360 |
+
conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus
|
361 |
+
)
|
362 |
+
ctx.delta_softplus = delta_softplus
|
363 |
+
ctx.out_proj_bias_is_None = out_proj_bias is None
|
364 |
+
ctx.checkpoint_lvl = checkpoint_lvl
|
365 |
+
if checkpoint_lvl >= 1: # Will recompute conv1d_out and delta in the backward pass
|
366 |
+
conv1d_out, delta = None, None
|
367 |
+
ctx.save_for_backward(xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight,
|
368 |
+
delta_proj_weight, out_proj_weight, conv1d_out, delta,
|
369 |
+
A, B, C, D, delta_bias, scan_intermediates, out)
|
370 |
+
return F.linear(rearrange(out_z, "b d l -> b l d"), out_proj_weight, out_proj_bias)
|
371 |
+
|
372 |
+
@staticmethod
|
373 |
+
@custom_bwd
|
374 |
+
def backward(ctx, dout):
|
375 |
+
# dout: (batch, seqlen, dim)
|
376 |
+
(xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight, delta_proj_weight, out_proj_weight,
|
377 |
+
conv1d_out, delta, A, B, C, D, delta_bias, scan_intermediates, out) = ctx.saved_tensors
|
378 |
+
L = xz.shape[-1]
|
379 |
+
delta_rank = delta_proj_weight.shape[1]
|
380 |
+
d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
|
381 |
+
x, z = xz.chunk(2, dim=1)
|
382 |
+
if dout.stride(-1) != 1:
|
383 |
+
dout = dout.contiguous()
|
384 |
+
if ctx.checkpoint_lvl == 1:
|
385 |
+
conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, None, True)
|
386 |
+
delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(),
|
387 |
+
"d (b l) -> b d l", l = L)
|
388 |
+
# The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
|
389 |
+
# backward of selective_scan_cuda with the backward of chunk).
|
390 |
+
dxz = torch.empty_like(xz) # (batch, dim, seqlen)
|
391 |
+
dx, dz = dxz.chunk(2, dim=1)
|
392 |
+
dout = rearrange(dout, "b l e -> e (b l)")
|
393 |
+
dout_y = rearrange(out_proj_weight.t() @ dout, "d (b l) -> b d l", l=L)
|
394 |
+
dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z = selective_scan_cuda.bwd(
|
395 |
+
conv1d_out, delta, A, B, C, D, z, delta_bias, dout_y, scan_intermediates, out, dz,
|
396 |
+
ctx.delta_softplus,
|
397 |
+
True # option to recompute out_z
|
398 |
+
)
|
399 |
+
dout_proj_weight = torch.einsum("eB,dB->ed", dout, rearrange(out_z, "b d l -> d (b l)"))
|
400 |
+
dout_proj_bias = dout.sum(dim=(0, 1)) if not ctx.out_proj_bias_is_None else None
|
401 |
+
dD = dD if D is not None else None
|
402 |
+
dx_dbl = torch.empty_like(x_dbl)
|
403 |
+
dB_proj_bias = None
|
404 |
+
if ctx.is_variable_B:
|
405 |
+
if not A.is_complex():
|
406 |
+
dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous()
|
407 |
+
else:
|
408 |
+
dB = rearrange(dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous()
|
409 |
+
dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None
|
410 |
+
dx_dbl[:, delta_rank:delta_rank + d_state] = dB # (bl d)
|
411 |
+
dB = None
|
412 |
+
dC_proj_bias = None
|
413 |
+
if ctx.is_variable_C:
|
414 |
+
if not A.is_complex():
|
415 |
+
dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous()
|
416 |
+
else:
|
417 |
+
dC = rearrange(dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous()
|
418 |
+
dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None
|
419 |
+
dx_dbl[:, -d_state:] = dC # (bl d)
|
420 |
+
dC = None
|
421 |
+
ddelta = rearrange(ddelta, "b d l -> d (b l)")
|
422 |
+
ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank])
|
423 |
+
dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight)
|
424 |
+
dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)")
|
425 |
+
dx_proj_weight = torch.einsum("Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d"))
|
426 |
+
dconv1d_out = torch.addmm(dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out)
|
427 |
+
dconv1d_out = rearrange(dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1])
|
428 |
+
# The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the
|
429 |
+
# backward of conv1d with the backward of chunk).
|
430 |
+
dx, dconv1d_weight, dconv1d_bias = causal_conv1d_cuda.causal_conv1d_bwd(
|
431 |
+
x, conv1d_weight, conv1d_bias, dconv1d_out, None, dx, True
|
432 |
+
)
|
433 |
+
dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None
|
434 |
+
dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w")
|
435 |
+
return (dxz, dconv1d_weight, dconv1d_bias, dx_proj_weight, ddelta_proj_weight,
|
436 |
+
dout_proj_weight, dout_proj_bias,
|
437 |
+
dA, dB, dC, dD,
|
438 |
+
ddelta_bias if delta_bias is not None else None,
|
439 |
+
dB_proj_bias, dC_proj_bias, None)
|
440 |
+
|
441 |
+
|
442 |
+
class BiMambaInnerFn(torch.autograd.Function):
|
443 |
+
|
444 |
+
@staticmethod
|
445 |
+
@custom_fwd
|
446 |
+
def forward(ctx, xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
|
447 |
+
out_proj_weight, out_proj_bias,
|
448 |
+
A, A_b, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None,
|
449 |
+
C_proj_bias=None, delta_softplus=True, checkpoint_lvl=1):
|
450 |
+
"""
|
451 |
+
xz: (batch, dim, seqlen)
|
452 |
+
"""
|
453 |
+
assert checkpoint_lvl in [0, 1]
|
454 |
+
L = xz.shape[-1]
|
455 |
+
delta_rank = delta_proj_weight.shape[1]
|
456 |
+
d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
|
457 |
+
if torch.is_autocast_enabled():
|
458 |
+
x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
|
459 |
+
delta_proj_weight = delta_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
|
460 |
+
out_proj_weight = out_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
|
461 |
+
out_proj_bias = (out_proj_bias.to(dtype=torch.get_autocast_gpu_dtype())
|
462 |
+
if out_proj_bias is not None else None)
|
463 |
+
if xz.stride(-1) != 1:
|
464 |
+
xz = xz.contiguous()
|
465 |
+
conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w")
|
466 |
+
x, z = xz.chunk(2, dim=1)
|
467 |
+
conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None
|
468 |
+
conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, None, True)
|
469 |
+
# We're being very careful here about the layout, to avoid extra transposes.
|
470 |
+
# We want delta to have d as the slowest moving dimension
|
471 |
+
# and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
|
472 |
+
x_dbl = F.linear(rearrange(conv1d_out, 'b d l -> (b l) d'), x_proj_weight) # (bl d)
|
473 |
+
delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l = L)
|
474 |
+
ctx.is_variable_B = B is None
|
475 |
+
ctx.is_variable_C = C is None
|
476 |
+
ctx.B_proj_bias_is_None = B_proj_bias is None
|
477 |
+
ctx.C_proj_bias_is_None = C_proj_bias is None
|
478 |
+
if B is None: # variable B
|
479 |
+
B = x_dbl[:, delta_rank:delta_rank + d_state] # (bl dstate)
|
480 |
+
if B_proj_bias is not None:
|
481 |
+
B = B + B_proj_bias.to(dtype=B.dtype)
|
482 |
+
if not A.is_complex():
|
483 |
+
# B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
|
484 |
+
B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
|
485 |
+
else:
|
486 |
+
B = rearrange(B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous()
|
487 |
+
else:
|
488 |
+
if B.stride(-1) != 1:
|
489 |
+
B = B.contiguous()
|
490 |
+
if C is None: # variable C
|
491 |
+
C = x_dbl[:, -d_state:] # (bl dstate)
|
492 |
+
if C_proj_bias is not None:
|
493 |
+
C = C + C_proj_bias.to(dtype=C.dtype)
|
494 |
+
if not A.is_complex():
|
495 |
+
# C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
|
496 |
+
C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
|
497 |
+
else:
|
498 |
+
C = rearrange(C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous()
|
499 |
+
else:
|
500 |
+
if C.stride(-1) != 1:
|
501 |
+
C = C.contiguous()
|
502 |
+
if D is not None:
|
503 |
+
D = D.contiguous()
|
504 |
+
out_f, scan_intermediates_f, out_z_f = selective_scan_cuda.fwd(
|
505 |
+
conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus
|
506 |
+
)
|
507 |
+
assert not A_b.is_complex(), "A_b should not be complex"
|
508 |
+
out_b, scan_intermediates_b, out_z_b = selective_scan_cuda.fwd(
|
509 |
+
conv1d_out.flip([-1]), delta.flip([-1]), A_b, B.flip([-1]), C.flip([-1]), D, z.flip([-1]), delta_bias, delta_softplus,
|
510 |
+
)
|
511 |
+
|
512 |
+
out_z = out_z_f + out_z_b.flip([-1])
|
513 |
+
|
514 |
+
ctx.delta_softplus = delta_softplus
|
515 |
+
ctx.out_proj_bias_is_None = out_proj_bias is None
|
516 |
+
ctx.checkpoint_lvl = checkpoint_lvl
|
517 |
+
if checkpoint_lvl >= 1: # Will recompute conv1d_out and delta in the backward pass
|
518 |
+
conv1d_out, delta = None, None
|
519 |
+
ctx.save_for_backward(xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight,
|
520 |
+
delta_proj_weight, out_proj_weight, conv1d_out, delta,
|
521 |
+
A, A_b, B, C, D, delta_bias, scan_intermediates_f, scan_intermediates_b, out_f, out_b)
|
522 |
+
return F.linear(rearrange(out_z, "b d l -> b l d"), out_proj_weight, out_proj_bias)
|
523 |
+
|
524 |
+
@staticmethod
|
525 |
+
@custom_bwd
|
526 |
+
def backward(ctx, dout):
|
527 |
+
# dout: (batch, seqlen, dim)
|
528 |
+
(xz, conv1d_weight, conv1d_bias, x_dbl, x_proj_weight, delta_proj_weight, out_proj_weight,
|
529 |
+
conv1d_out, delta, A, A_b, B, C, D, delta_bias, scan_intermediates_f, scan_intermediates_b, out_f, out_b) = ctx.saved_tensors
|
530 |
+
L = xz.shape[-1]
|
531 |
+
delta_rank = delta_proj_weight.shape[1]
|
532 |
+
d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
|
533 |
+
x, z = xz.chunk(2, dim=1)
|
534 |
+
if dout.stride(-1) != 1:
|
535 |
+
dout = dout.contiguous()
|
536 |
+
if ctx.checkpoint_lvl == 1:
|
537 |
+
conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, None, True)
|
538 |
+
delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(),
|
539 |
+
"d (b l) -> b d l", l = L)
|
540 |
+
# The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
|
541 |
+
# backward of selective_scan_cuda with the backward of chunk).
|
542 |
+
dxz = torch.empty_like(xz) # (batch, dim, seqlen)
|
543 |
+
dx, dz = dxz.chunk(2, dim=1)
|
544 |
+
dout = rearrange(dout, "b l e -> e (b l)")
|
545 |
+
dout_y = rearrange(out_proj_weight.t() @ dout, "d (b l) -> b d l", l=L)
|
546 |
+
dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z_f = selective_scan_cuda.bwd(
|
547 |
+
conv1d_out, delta, A, B, C, D, z, delta_bias, dout_y, scan_intermediates_f, out_f, dz,
|
548 |
+
ctx.delta_softplus,
|
549 |
+
True # option to recompute out_z
|
550 |
+
)
|
551 |
+
# flip one
|
552 |
+
dz_b = torch.empty_like(dz)
|
553 |
+
dconv1d_out_f_b, ddelta_f_b, dA_b, dB_f_b, dC_f_b, dD_b, ddelta_bias_b, dz_b, out_z_b = selective_scan_cuda.bwd(
|
554 |
+
conv1d_out.flip([-1]), delta.flip([-1]), A_b, B.flip([-1]), C.flip([-1]), D, z.flip([-1]), delta_bias, dout_y.flip([-1]), scan_intermediates_b, out_b, dz_b,
|
555 |
+
ctx.delta_softplus,
|
556 |
+
True # option to recompute out_z
|
557 |
+
)
|
558 |
+
|
559 |
+
dconv1d_out = dconv1d_out + dconv1d_out_f_b.flip([-1])
|
560 |
+
ddelta = ddelta + ddelta_f_b.flip([-1])
|
561 |
+
dB = dB + dB_f_b.flip([-1])
|
562 |
+
dC = dC + dC_f_b.flip([-1])
|
563 |
+
dD = dD + dD_b
|
564 |
+
ddelta_bias = ddelta_bias + ddelta_bias_b
|
565 |
+
dz = dz + dz_b.flip([-1])
|
566 |
+
out_z = out_z_f + out_z_b.flip([-1])
|
567 |
+
|
568 |
+
dout_proj_weight = torch.einsum("eB,dB->ed", dout, rearrange(out_z, "b d l -> d (b l)"))
|
569 |
+
dout_proj_bias = dout.sum(dim=(0, 1)) if not ctx.out_proj_bias_is_None else None
|
570 |
+
dD = dD if D is not None else None
|
571 |
+
dx_dbl = torch.empty_like(x_dbl)
|
572 |
+
dB_proj_bias = None
|
573 |
+
if ctx.is_variable_B:
|
574 |
+
if not A.is_complex():
|
575 |
+
dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous()
|
576 |
+
else:
|
577 |
+
dB = rearrange(dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous()
|
578 |
+
dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None
|
579 |
+
dx_dbl[:, delta_rank:delta_rank + d_state] = dB # (bl d)
|
580 |
+
dB = None
|
581 |
+
dC_proj_bias = None
|
582 |
+
if ctx.is_variable_C:
|
583 |
+
if not A.is_complex():
|
584 |
+
dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous()
|
585 |
+
else:
|
586 |
+
dC = rearrange(dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous()
|
587 |
+
dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None
|
588 |
+
dx_dbl[:, -d_state:] = dC # (bl d)
|
589 |
+
dC = None
|
590 |
+
ddelta = rearrange(ddelta, "b d l -> d (b l)")
|
591 |
+
ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank])
|
592 |
+
dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight)
|
593 |
+
dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)")
|
594 |
+
dx_proj_weight = torch.einsum("Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d"))
|
595 |
+
dconv1d_out = torch.addmm(dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out)
|
596 |
+
dconv1d_out = rearrange(dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1])
|
597 |
+
# The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the
|
598 |
+
# backward of conv1d with the backward of chunk).
|
599 |
+
dx, dconv1d_weight, dconv1d_bias = causal_conv1d_cuda.causal_conv1d_bwd(
|
600 |
+
x, conv1d_weight, conv1d_bias, dconv1d_out, None, dx, True
|
601 |
+
)
|
602 |
+
dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None
|
603 |
+
dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w")
|
604 |
+
return (dxz, dconv1d_weight, dconv1d_bias, dx_proj_weight, ddelta_proj_weight,
|
605 |
+
dout_proj_weight, dout_proj_bias,
|
606 |
+
dA, dA_b, dB, dC, dD,
|
607 |
+
ddelta_bias if delta_bias is not None else None,
|
608 |
+
dB_proj_bias, dC_proj_bias, None)
|
609 |
+
|
610 |
+
|
611 |
+
def mamba_inner_fn(
|
612 |
+
xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
|
613 |
+
out_proj_weight, out_proj_bias,
|
614 |
+
A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None,
|
615 |
+
C_proj_bias=None, delta_softplus=True
|
616 |
+
):
|
617 |
+
return MambaInnerFn.apply(xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
|
618 |
+
out_proj_weight, out_proj_bias,
|
619 |
+
A, B, C, D, delta_bias, B_proj_bias, C_proj_bias, delta_softplus)
|
620 |
+
|
621 |
+
def bimamba_inner_fn(
|
622 |
+
xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
|
623 |
+
out_proj_weight, out_proj_bias,
|
624 |
+
A, A_b, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None,
|
625 |
+
C_proj_bias=None, delta_softplus=True
|
626 |
+
):
|
627 |
+
return BiMambaInnerFn.apply(xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
|
628 |
+
out_proj_weight, out_proj_bias,
|
629 |
+
A, A_b, B, C, D, delta_bias, B_proj_bias, C_proj_bias, delta_softplus)
|
630 |
+
|
631 |
+
|
632 |
+
def mamba_inner_fn_no_out_proj(
|
633 |
+
xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
|
634 |
+
A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None,
|
635 |
+
C_proj_bias=None, delta_softplus=True
|
636 |
+
):
|
637 |
+
return MambaInnerFnNoOutProj.apply(xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
|
638 |
+
A, B, C, D, delta_bias, B_proj_bias, C_proj_bias, delta_softplus)
|
639 |
+
|
640 |
+
|
641 |
+
def mamba_inner_ref(
|
642 |
+
xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
|
643 |
+
out_proj_weight, out_proj_bias,
|
644 |
+
A, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None,
|
645 |
+
C_proj_bias=None, delta_softplus=True
|
646 |
+
):
|
647 |
+
L = xz.shape[-1]
|
648 |
+
delta_rank = delta_proj_weight.shape[1]
|
649 |
+
d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
|
650 |
+
x, z = xz.chunk(2, dim=1)
|
651 |
+
x = causal_conv1d_fn(x, rearrange(conv1d_weight, "d 1 w -> d w"), conv1d_bias, "silu")
|
652 |
+
# We're being very careful here about the layout, to avoid extra transposes.
|
653 |
+
# We want delta to have d as the slowest moving dimension
|
654 |
+
# and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
|
655 |
+
x_dbl = F.linear(rearrange(x, 'b d l -> (b l) d'), x_proj_weight) # (bl d)
|
656 |
+
delta = delta_proj_weight @ x_dbl[:, :delta_rank].t()
|
657 |
+
delta = rearrange(delta, "d (b l) -> b d l", l=L)
|
658 |
+
if B is None: # variable B
|
659 |
+
B = x_dbl[:, delta_rank:delta_rank + d_state] # (bl d)
|
660 |
+
if B_proj_bias is not None:
|
661 |
+
B = B + B_proj_bias.to(dtype=B.dtype)
|
662 |
+
if not A.is_complex():
|
663 |
+
B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
|
664 |
+
else:
|
665 |
+
B = rearrange(B, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2).contiguous()
|
666 |
+
if C is None: # variable C
|
667 |
+
C = x_dbl[:, -d_state:] # (bl d)
|
668 |
+
if C_proj_bias is not None:
|
669 |
+
C = C + C_proj_bias.to(dtype=C.dtype)
|
670 |
+
if not A.is_complex():
|
671 |
+
C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
|
672 |
+
else:
|
673 |
+
C = rearrange(C, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2).contiguous()
|
674 |
+
y = selective_scan_fn(x, delta, A, B, C, D, z=z, delta_bias=delta_bias, delta_softplus=True)
|
675 |
+
return F.linear(rearrange(y, "b d l -> b l d"), out_proj_weight, out_proj_bias)
|
676 |
+
|
677 |
+
|
678 |
+
def bimamba_inner_ref(
|
679 |
+
xz, conv1d_weight, conv1d_bias, x_proj_weight, delta_proj_weight,
|
680 |
+
out_proj_weight, out_proj_bias,
|
681 |
+
A, A_b, B=None, C=None, D=None, delta_bias=None, B_proj_bias=None,
|
682 |
+
C_proj_bias=None, delta_softplus=True
|
683 |
+
):
|
684 |
+
L = xz.shape[-1]
|
685 |
+
delta_rank = delta_proj_weight.shape[1]
|
686 |
+
d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
|
687 |
+
x, z = xz.chunk(2, dim=1)
|
688 |
+
x = causal_conv1d_fn(x, rearrange(conv1d_weight, "d 1 w -> d w"), conv1d_bias, "silu")
|
689 |
+
# We're being very careful here about the layout, to avoid extra transposes.
|
690 |
+
# We want delta to have d as the slowest moving dimension
|
691 |
+
# and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
|
692 |
+
x_dbl = F.linear(rearrange(x, 'b d l -> (b l) d'), x_proj_weight) # (bl d)
|
693 |
+
delta = delta_proj_weight @ x_dbl[:, :delta_rank].t()
|
694 |
+
delta = rearrange(delta, "d (b l) -> b d l", l=L)
|
695 |
+
if B is None: # variable B
|
696 |
+
B = x_dbl[:, delta_rank:delta_rank + d_state] # (bl d)
|
697 |
+
if B_proj_bias is not None:
|
698 |
+
B = B + B_proj_bias.to(dtype=B.dtype)
|
699 |
+
if not A.is_complex():
|
700 |
+
B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
|
701 |
+
else:
|
702 |
+
B = rearrange(B, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2).contiguous()
|
703 |
+
if C is None: # variable C
|
704 |
+
C = x_dbl[:, -d_state:] # (bl d)
|
705 |
+
if C_proj_bias is not None:
|
706 |
+
C = C + C_proj_bias.to(dtype=C.dtype)
|
707 |
+
if not A.is_complex():
|
708 |
+
C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
|
709 |
+
else:
|
710 |
+
C = rearrange(C, "(b l) (dstate two) -> b dstate (l two)", l=L, two=2).contiguous()
|
711 |
+
y = selective_scan_fn(x, delta, A, B, C, D, z=z, delta_bias=delta_bias, delta_softplus=True)
|
712 |
+
y_b = selective_scan_fn(x.flip([-1]), delta.flip([-1]), A_b, B.flip([-1]), C.flip([-1]), D, z.flip([-1]), delta_bias, delta_softplus=True)
|
713 |
+
y = y + y_b.flip([-1])
|
714 |
+
return F.linear(rearrange(y, "b d l -> b l d"), out_proj_weight, out_proj_bias)
|
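The `selective_scan_ref` docstring above spells out the expected layouts: u, delta, and z are (B, D, L); A is (D, N); input-dependent B and C are (B, N, L); D and delta_bias are (D,). The sketch below exercises the pure-PyTorch reference path with random tensors of those shapes. It assumes `einops` is installed and that this module's top-level imports (`causal_conv1d`, `selective_scan_cuda`) resolve, since they run at import time even though the reference function itself needs no CUDA kernels.

import torch
from model.modules.mamba.selective_scan_interface import selective_scan_ref

B, D, N, L = 2, 16, 8, 32                 # batch, channels, state size, sequence length
u     = torch.randn(B, D, L)
delta = torch.randn(B, D, L)
A     = -torch.rand(D, N)                 # negative real A keeps exp(delta * A) bounded
Bmat  = torch.randn(B, N, L)              # input-dependent ("variable") B
Cmat  = torch.randn(B, N, L)              # input-dependent ("variable") C
Dvec  = torch.randn(D)
z     = torch.randn(B, D, L)
delta_bias = torch.zeros(D)

out, last_state = selective_scan_ref(
    u, delta, A, Bmat, Cmat, D=Dvec, z=z,
    delta_bias=delta_bias, delta_softplus=True, return_last_state=True,
)
print(out.shape)         # torch.Size([2, 16, 32])  -> (B, D, L)
print(last_state.shape)  # torch.Size([2, 16, 8])   -> (B, D, N)

# Bidirectional combination, mirroring bimamba_inner_ref: a second scan over the
# time-reversed inputs with its own A matrix, flipped back and summed.
A_rev   = -torch.rand(D, N)
out_rev = selective_scan_ref(u.flip([-1]), delta.flip([-1]), A_rev, Bmat.flip([-1]),
                             Cmat.flip([-1]), D=Dvec, z=z.flip([-1]),
                             delta_bias=delta_bias, delta_softplus=True)
out_bi = out + out_rev.flip([-1])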
model/patchify.py
ADDED
@@ -0,0 +1,20 @@
import torch
import torch.nn as nn

class Patchify(nn.Module):
    def __init__(self, in_channels, out_channels, patch_size):
        super(Patchify, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=(8, patch_size), stride=(8, patch_size), padding=0, bias=False)

    def forward(self, x):
        # x.shape = (batch_size, channels, height, width)
        x = self.conv(x)

        return x

if __name__ == "__main__":
    model = Patchify(1, 32, 2)
    print(model)
    dummy_input = torch.randn(1, 1, 64, 16)
    output = model(dummy_input)
    print(output.shape)
model/sinc_conv.py
ADDED
@@ -0,0 +1,471 @@
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import torch.nn as nn
|
5 |
+
import torch.fft
|
6 |
+
import sys
|
7 |
+
from torch.autograd import Variable
|
8 |
+
import math
|
9 |
+
|
10 |
+
class GlobalLayerNorm(nn.Module):
|
11 |
+
'''
|
12 |
+
Calculate Global Layer Normalization
|
13 |
+
dim: (int or list or torch.Size) –
|
14 |
+
input shape from an expected input of size
|
15 |
+
eps: a value added to the denominator for numerical stability.
|
16 |
+
elementwise_affine: a boolean value that when set to True,
|
17 |
+
this module has learnable per-element affine parameters
|
18 |
+
initialized to ones (for weights) and zeros (for biases).
|
19 |
+
'''
|
20 |
+
|
21 |
+
def __init__(self, dim, eps=1e-05, elementwise_affine=True):
|
22 |
+
super(GlobalLayerNorm, self).__init__()
|
23 |
+
self.dim = dim
|
24 |
+
self.eps = eps
|
25 |
+
self.elementwise_affine = elementwise_affine
|
26 |
+
|
27 |
+
if self.elementwise_affine:
|
28 |
+
self.weight = nn.Parameter(torch.ones(self.dim, 1))
|
29 |
+
self.bias = nn.Parameter(torch.zeros(self.dim, 1))
|
30 |
+
else:
|
31 |
+
self.register_parameter('weight', None)
|
32 |
+
self.register_parameter('bias', None)
|
33 |
+
|
34 |
+
def forward(self, x):
|
35 |
+
# x = N x C x L
|
36 |
+
# N x 1 x 1
|
37 |
+
# cln: mean,var N x 1 x L
|
38 |
+
# gln: mean,var N x 1 x 1
|
39 |
+
if x.dim() != 3:
|
40 |
+
raise RuntimeError("{} accepts 3D tensor as input".format(
|
41 |
+
self.__class__.__name__))
|
42 |
+
|
43 |
+
mean = torch.mean(x, (1, 2), keepdim=True)
|
44 |
+
var = torch.mean((x-mean)**2, (1, 2), keepdim=True)
|
45 |
+
# N x C x L
|
46 |
+
if self.elementwise_affine:
|
47 |
+
x = self.weight*(x-mean)/torch.sqrt(var+self.eps)+self.bias
|
48 |
+
else:
|
49 |
+
x = (x-mean)/torch.sqrt(var+self.eps)
|
50 |
+
return x
|
51 |
+
|
52 |
+
|
53 |
+
class TimeSincExtractor(nn.Module):
|
54 |
+
"""Sinc-based convolution
|
55 |
+
Parameters
|
56 |
+
----------
|
57 |
+
in_channels : `int`
|
58 |
+
Number of input channels. Must be 1.
|
59 |
+
out_channels : `int`
|
60 |
+
Number of filters.
|
61 |
+
kernel_size : `int`
|
62 |
+
Filter length.
|
63 |
+
sample_rate : `int`, optional
|
64 |
+
Sample rate. Defaults to 16000.
|
65 |
+
triangular : `bool`
|
66 |
+
Squared sinc -> Triangular filter.
|
67 |
+
freq_nml : `bool`
|
68 |
+
Normalized to gain of 1 in frequency.
|
69 |
+
range_constraint : `bool`
|
70 |
+
Project the learned band within nyquist freq manually.
|
71 |
+
Usage
|
72 |
+
-----
|
73 |
+
See `torch.nn.Conv1d`
|
74 |
+
"""
|
75 |
+
|
76 |
+
@staticmethod
|
77 |
+
def to_mel(hz):
|
78 |
+
return 2595 * np.log10(1 + hz / 700)
|
79 |
+
|
80 |
+
@staticmethod
|
81 |
+
def to_hz(mel):
|
82 |
+
return 700 * (10 ** (mel / 2595) - 1)
|
83 |
+
|
84 |
+
def swap_(self, x, y, sort=False):
|
85 |
+
mini = torch.minimum(x, y)
|
86 |
+
maxi = torch.maximum(x, y)
|
87 |
+
|
88 |
+
if sort:
|
89 |
+
mini, idx = torch.sort(mini)
|
90 |
+
maxi = maxi[idx].view(mini.shape)
|
91 |
+
|
92 |
+
return mini, maxi
|
93 |
+
|
94 |
+
def __init__(self, out_channels, kernel_size, triangular=False,
|
95 |
+
freq_nml=False, range_constraint=False, freq_init='uniform', norm_after=True, sample_rate=16000, in_channels=1,
|
96 |
+
stride=1, padding=0, dilation=1, bias=False, groups=1, min_low_hz=50, min_band_hz=50, bi_factor=False, frame_length=400, hop_length=160):
|
97 |
+
|
98 |
+
super(TimeSincExtractor,self).__init__()
|
99 |
+
|
100 |
+
if in_channels != 1:
|
101 |
+
# msg = (f'SincConv only support one input channel '
|
102 |
+
# f'(here, in_channels = {in_channels:d}).')
|
103 |
+
msg = "SincConv only support one input channel (here, in_channels = {%i})" % (in_channels)
|
104 |
+
raise ValueError(msg)
|
105 |
+
|
106 |
+
self.out_channels = out_channels
|
107 |
+
self.kernel_size = kernel_size
|
108 |
+
self.triangular = False  # NOTE: hard-coded, so the 'triangular' constructor argument is currently ignored
|
109 |
+
self.freq_nml = False  # NOTE: hard-coded, so the 'freq_nml' constructor argument is currently ignored
|
110 |
+
|
111 |
+
# Forcing the filters to be odd (i.e, perfectly symmetrics)
|
112 |
+
if kernel_size%2 == 0:
|
113 |
+
self.kernel_size = self.kernel_size+1
|
114 |
+
|
115 |
+
self.stride = stride
|
116 |
+
self.padding = padding
|
117 |
+
self.dilation = dilation
|
118 |
+
|
119 |
+
self.frame_length = frame_length
|
120 |
+
self.hop_length = hop_length
|
121 |
+
|
122 |
+
if bias:
|
123 |
+
raise ValueError('SincConv does not support bias.')
|
124 |
+
if groups > 1:
|
125 |
+
raise ValueError('SincConv does not support groups.')
|
126 |
+
|
127 |
+
self.sample_rate = sample_rate
|
128 |
+
self.nyquist_rate = sample_rate/2
|
129 |
+
self.min_low_hz = min_low_hz
|
130 |
+
self.min_band_hz = min_band_hz
|
131 |
+
self.range_constraint = range_constraint
|
132 |
+
self.bi_factor = bi_factor
|
133 |
+
|
134 |
+
if self.range_constraint:
|
135 |
+
# msg = "Range constraint in learned frequency is not supported yet."
|
136 |
+
# raise ValueError(msg)
|
137 |
+
if freq_init == "uniform":
|
138 |
+
low_freq, high_freq = torch.rand(out_channels*2).chunk(2)
|
139 |
+
elif freq_init == "formant":
|
140 |
+
# raise NotImplementedError('Formant distribution hasn\'t been implemented yet.')
|
141 |
+
p = np.load('/share/nas165/Jasonho610/SincNet/exp/formant_distribution.npy')
|
142 |
+
low_freq, high_freq = torch.from_numpy(np.random.choice(8000, out_channels*2, p=p)).chunk(2)
|
143 |
+
low_freq = low_freq / self.nyquist_rate
|
144 |
+
high_freq = high_freq / self.nyquist_rate
|
145 |
+
elif freq_init == "mel":
|
146 |
+
# raise NotImplementedError('Mel distribution hasn\'t been implemented yet.')
|
147 |
+
low_hz = 30
|
148 |
+
high_hz = self.nyquist_rate - (self.min_low_hz + self.min_band_hz)
|
149 |
+
mel = np.linspace(self.to_mel(low_hz),
|
150 |
+
self.to_mel(high_hz),
|
151 |
+
self.out_channels + 1)
|
152 |
+
hz = self.to_hz(mel)
|
153 |
+
low_freq = torch.Tensor(hz[:-1]) / self.nyquist_rate
|
154 |
+
high_freq = torch.Tensor(hz[1:]) / self.nyquist_rate
|
155 |
+
else:
|
156 |
+
raise ValueError('SincConv must specify the freq initialization methods.')
|
157 |
+
|
158 |
+
low_freq, high_freq = self.swap_(low_freq, high_freq)
|
159 |
+
|
160 |
+
if self.bi_factor:
|
161 |
+
self.band_imp = nn.Parameter(torch.ones(out_channels))
|
162 |
+
self.low_f_ = nn.Parameter(low_freq.view(-1, 1))
|
163 |
+
self.high_f_ = nn.Parameter(high_freq.view(-1, 1))
|
164 |
+
else:
|
165 |
+
# initialize filterbanks such that they are equally spaced in Mel scale
|
166 |
+
low_hz = 30
|
167 |
+
high_hz = self.nyquist_rate - (self.min_low_hz + self.min_band_hz)
|
168 |
+
mel = np.linspace(self.to_mel(low_hz),
|
169 |
+
self.to_mel(high_hz),
|
170 |
+
self.out_channels + 1)
|
171 |
+
hz = self.to_hz(mel)
|
172 |
+
# filter lower frequency (out_channels, 1)
|
173 |
+
self.low_hz_ = nn.Parameter(torch.Tensor(hz[:-1]).view(-1, 1))
|
174 |
+
|
175 |
+
# filter frequency band (out_channels, 1)
|
176 |
+
self.band_hz_ = nn.Parameter(torch.Tensor(np.diff(hz)).view(-1, 1))
|
177 |
+
|
178 |
+
# Hamming window
|
179 |
+
# self.window_ = torch.hamming_window(self.kernel_size)
|
180 |
+
n_lin = torch.linspace(0, (self.kernel_size/2)-1, steps=int((self.kernel_size/2))) # computing only half of the window
|
181 |
+
self.window_ = 0.54 - 0.46 * torch.cos(2 * math.pi * n_lin / self.kernel_size)
|
182 |
+
|
183 |
+
# (1, kernel_size/2)
|
184 |
+
n = (self.kernel_size - 1) / 2.0
|
185 |
+
self.n_ = 2*math.pi*torch.arange(-n, 0).view(1, -1) / self.sample_rate # Due to symmetry, I only need half of the time axes
|
186 |
+
|
187 |
+
self.norm_after = norm_after
|
188 |
+
if self.norm_after:
|
189 |
+
self.ln = GlobalLayerNorm(out_channels)
|
190 |
+
|
191 |
+
|
192 |
+
def forward(self, waveforms, embedding):
|
193 |
+
"""
|
194 |
+
Parameters
|
195 |
+
----------
|
196 |
+
waveforms : `torch.Tensor` (batch_size, 1, n_samples)
|
197 |
+
Batch of waveforms.
|
198 |
+
Returns
|
199 |
+
-------
|
200 |
+
features : `torch.Tensor` (batch_size, out_channels, n_samples_out)
|
201 |
+
Batch of sinc filters activations.
|
202 |
+
"""
|
203 |
+
|
204 |
+
self.n_ = self.n_.to(waveforms.device)
|
205 |
+
self.window_ = self.window_.to(waveforms.device)
|
206 |
+
# waveforms = waveforms.unsqueeze(1)
|
207 |
+
# print("Waveforms:", waveforms.shape)
|
208 |
+
|
209 |
+
framing_padding = self.frame_length - (waveforms.shape[-1] % self.hop_length)
|
210 |
+
waveforms = F.pad(waveforms, (0, framing_padding))
|
211 |
+
frames = waveforms.unfold(-1, self.frame_length, self.hop_length)
|
212 |
+
|
213 |
+
batch_size = frames.shape[0]
|
214 |
+
n_frames = frames.shape[2]
|
215 |
+
|
216 |
+
if self.range_constraint:
|
217 |
+
            low_f_, high_f_ = self.swap_(torch.abs(self.low_f_), torch.abs(self.high_f_))

            low = self.min_low_hz + low_f_*self.nyquist_rate
            high = torch.clamp(self.min_band_hz + high_f_*self.nyquist_rate, self.min_low_hz, self.nyquist_rate)
            band = (high-low)[:,0]
        else:
            low = self.min_low_hz + torch.abs(self.low_hz_)
            high = torch.clamp(low + self.min_band_hz + torch.abs(self.band_hz_), self.min_low_hz, self.nyquist_rate)
            band = (high-low)[:,0]

        self.low = low
        self.high = high
        self.band = band

        f_times_t_low = torch.matmul(low, self.n_)
        f_times_t_high = torch.matmul(high, self.n_)

        band_pass_left = ((torch.sin(f_times_t_high)-torch.sin(f_times_t_low))/(self.n_/2))*self.window_  # Equivalent of Eq.4 of the reference paper (SPEAKER RECOGNITION FROM RAW WAVEFORM WITH SINCNET). I just have expanded the sinc and simplified the terms. This way I avoid several useless computations.
        band_pass_center = 2*band.view(-1,1)
        band_pass_right = torch.flip(band_pass_left, dims=[1])

        band_pass = torch.cat([band_pass_left, band_pass_center, band_pass_right], dim=1)

        band_pass = band_pass / (2*band[:,None])

        if self.triangular:
            band_pass = band_pass**2

        if self.freq_nml:
            mag_resp = torch.fft.rfft(band_pass).abs()
            mag_max = torch.max(mag_resp, dim=-1)[0]
            band_pass = band_pass / mag_max.unsqueeze(-1)

        if self.bi_factor:
            band_imp = F.relu(self.band_imp)
            band_pass = band_pass * band_imp.unsqueeze(-1)

        self.filters = (band_pass).view(
            self.out_channels, 1, self.kernel_size)

        # print("Filters:", self.filters.shape)
        # print("Frames:", frames.shape)

        rs_frames = frames.reshape(batch_size*n_frames, 1, self.frame_length)
        # print("Reshaped frames:", rs_frames.shape)

        filtered = F.conv1d(rs_frames, self.filters, stride=self.stride,
                            padding=self.padding, dilation=self.dilation,
                            bias=None, groups=1)
        # print('Pass conv1d')
        # print("Filtered:", filtered.shape)
        if self.norm_after:
            filtered = self.ln(filtered)

        # print("Normed filtered:", filtered.shape)

        filtered = filtered.reshape(batch_size, n_frames, self.out_channels, -1)

        # print("Final filtered:", filtered.shape)

        energy = torch.mean(filtered**2, dim=-1)
        log_filtered_energy = torch.log10(energy + 1e-6)
        # print("Log filtered energy:", log_filtered_energy.shape)  # (batch_size, n_samples_out(time), out_channels(frequency))

        log_filtered_energy = log_filtered_energy.unsqueeze(1)
        # print("Unsqueezed log filtered energy:", log_filtered_energy.shape)  # (batch_size, channels, n_samples_out(time), out_channels(frequency))

        log_filtered_energy = log_filtered_energy.permute(0, 1, 3, 2)
        # print("Permuted log filtered energy:", log_filtered_energy.shape)  # (batch_size, channels, out_channels(frequency), n_samples_out(time))

        return log_filtered_energy, self.filters, self.stride, self.padding


class FreqSincExtractor(nn.Module):
    @staticmethod
    def to_mel(hz):
        return 2595 * np.log10(1 + hz / 700)

    @staticmethod
    def to_hz(mel):
        return 700 * (10 ** (mel / 2595) - 1)

    def swap_(self, x, y, sort=False):
        mini = torch.minimum(x, y)
        maxi = torch.maximum(x, y)
        if sort:
            mini, idx = torch.sort(mini)
            maxi = maxi[idx].view(mini.shape)
        return mini, maxi

    def __init__(self, out_channels, kernel_size, triangular=False,
                 freq_nml=False, range_constraint=False, freq_init='uniform',
                 norm_after=True, sample_rate=16000, in_channels=1,
                 stride=1, padding=0, dilation=1, bias=False, groups=1,
                 min_low_hz=50, min_band_hz=50, bi_factor=False,
                 frame_length=400, hop_length=160, n_fft=400):
        super(FreqSincExtractor, self).__init__()

        if in_channels != 1:
            msg = "FreqSincExtractor only supports one input channel (here, in_channels = {%i})" % (in_channels)
            raise ValueError(msg)

        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.triangular = triangular
        self.freq_nml = freq_nml
        self.sample_rate = sample_rate
        self.nyquist_rate = sample_rate/2
        self.min_low_hz = min_low_hz
        self.min_band_hz = min_band_hz
        self.range_constraint = range_constraint
        self.bi_factor = bi_factor
        self.frame_length = frame_length
        self.hop_length = hop_length
        self.n_fft = n_fft
        self.stride = stride
        self.padding = padding
        self.output_size = 64

        # Initialize frequency bands
        if self.range_constraint:
            if freq_init == "uniform":
                low_freq, high_freq = torch.rand(out_channels*2).chunk(2)
            elif freq_init == "mel":
                low_hz = 30
                high_hz = self.nyquist_rate - (self.min_low_hz + self.min_band_hz)
                mel = np.linspace(self.to_mel(low_hz),
                                  self.to_mel(high_hz),
                                  self.out_channels + 1)
                hz = self.to_hz(mel)
                low_freq = torch.Tensor(hz[:-1]) / self.nyquist_rate
                high_freq = torch.Tensor(hz[1:]) / self.nyquist_rate
            else:
                raise ValueError('FreqSincExtractor must specify the freq initialization methods.')

            low_freq, high_freq = self.swap_(low_freq, high_freq)

            if self.bi_factor:
                self.band_imp = nn.Parameter(torch.ones(out_channels))
            self.low_f_ = nn.Parameter(low_freq.view(-1, 1))
            self.high_f_ = nn.Parameter(high_freq.view(-1, 1))
        else:
            low_hz = 30
            high_hz = self.nyquist_rate - (self.min_low_hz + self.min_band_hz)
            mel = np.linspace(self.to_mel(low_hz),
                              self.to_mel(high_hz),
                              self.out_channels + 1)
            hz = self.to_hz(mel)
            self.low_hz_ = nn.Parameter(torch.Tensor(hz[:-1]).view(-1, 1))
            self.band_hz_ = nn.Parameter(torch.Tensor(np.diff(hz)).view(-1, 1))

        # Frequency axis for STFT
        self.freq_axis = torch.linspace(0, self.nyquist_rate, self.n_fft//2 + 1)

        self.norm_after = norm_after
        if self.norm_after:
            self.ln = GlobalLayerNorm(out_channels)

    def get_filters(self):
        if self.range_constraint:
            low_f_, high_f_ = self.swap_(torch.abs(self.low_f_), torch.abs(self.high_f_))
            low = self.min_low_hz + low_f_ * self.nyquist_rate
            high = torch.clamp(self.min_low_hz + high_f_ * self.nyquist_rate,
                               self.min_low_hz, self.nyquist_rate)
        else:
            low = self.min_low_hz + torch.abs(self.low_hz_)
            high = torch.clamp(low + self.min_band_hz + torch.abs(self.band_hz_),
                               self.min_low_hz, self.nyquist_rate)

        # Create frequency domain filters
        freq_axis = self.freq_axis.to(low.device)
        filters = torch.zeros((self.out_channels, len(freq_axis))).to(low.device)

        for i in range(self.out_channels):
            mask = (freq_axis >= low[i]) & (freq_axis <= high[i])
            filters[i, mask] = 1.0

            if self.triangular:
                center_freq = (low[i] + high[i]) / 2
                bandwidth = high[i] - low[i]
                mask = (freq_axis >= low[i]) & (freq_axis <= high[i])
                freq_response = 1.0 - torch.abs(freq_axis[mask] - center_freq) / (bandwidth/2)
                filters[i, mask] = freq_response

        if self.freq_nml:
            filters = F.normalize(filters, p=2, dim=1)

        if self.bi_factor:
            band_imp = F.relu(self.band_imp)
            filters = filters * band_imp.unsqueeze(-1)

        return filters

    def forward(self, waveforms, embedding=None):
        batch_size = waveforms.shape[0]

        # Calculate necessary padding to achieve the correct output size
        target_length = self.hop_length * (self.output_size - 1) + self.frame_length
        current_length = waveforms.shape[-1]
        padding_needed = target_length - current_length

        # Pad the input if necessary
        if padding_needed > 0:
            waveforms = F.pad(waveforms, (0, padding_needed))

        # Compute STFT
        stft = torch.stft(waveforms.squeeze(1),
                          n_fft=self.n_fft,
                          hop_length=self.hop_length,
                          win_length=self.frame_length,
                          window=torch.hann_window(self.frame_length).to(waveforms.device),
                          return_complex=True)

        # Get magnitude spectrogram
        mag_spec = torch.abs(stft)  # (batch_size, freq_bins, time_frames)

        # Get and apply filters
        filters = self.get_filters()  # (out_channels, freq_bins)
        filtered = torch.matmul(filters, mag_spec)  # (batch_size, out_channels, time_frames)

        if self.norm_after:
            filtered = self.ln(filtered)

        # Compute log energy
        energy = filtered ** 2
        log_energy = torch.log10(energy + 1e-6)

        # Ensure correct time dimension
        if log_energy.shape[-1] != self.output_size:
            log_energy = F.interpolate(
                log_energy,
                size=self.output_size,
                mode='linear',
                align_corners=False
            )

        # Reshape to the desired output format
        log_energy = log_energy.unsqueeze(1)   # Add channel dimension
        log_energy = log_energy.permute(0, 1, 3, 2)  # Rearrange to (batch, channel, freq, time)

        return log_energy, filters, self.stride, self.padding


if __name__ == "__main__":
    batch_size = 256
    n_samples = 10080
    waveforms = torch.rand(batch_size, 1, n_samples)

    # model = TimeSincExtractor(out_channels=64, kernel_size=101, range_constraint=True, stride=2)
    model = FreqSincExtractor(out_channels=64, kernel_size=101, range_constraint=True, stride=2)
    print(model)

    outputs, _, _, _ = model(waveforms, embedding=None)
    print("Outputs:", outputs.shape)
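For reference, a minimal usage sketch of FreqSincExtractor (assuming, on my part, that the repo root is on PYTHONPATH so the `model` package imports resolve). With the defaults above, target_length = hop_length*(output_size - 1) + frame_length = 160*63 + 400 = 10480 samples, so the 10080-sample demo input is right-padded by 400 samples before the STFT; torch.stft (center=True by default) then yields 66 frames, which F.interpolate resizes to the fixed 64-frame output.

    import torch
    from model.sinc_conv import FreqSincExtractor  # assumed import path

    extractor = FreqSincExtractor(out_channels=64, kernel_size=101,
                                  range_constraint=True, stride=2)
    waveforms = torch.rand(2, 1, 10080)               # (batch, 1, samples) at 16 kHz
    log_energy, filters, _, _ = extractor(waveforms, embedding=None)
    print(log_energy.shape)   # expected: (2, 1, 64, 64) -> (batch, channel, bands, frames)
    print(filters.shape)      # expected: (64, 201)      -> (out_channels, n_fft//2 + 1)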
model/tiny_block.py
ADDED
@@ -0,0 +1,31 @@
import torch
import torch.nn as nn

class TinyBlock(nn.Module):
    def __init__(self, in_channels, out_channels, dilation=2):
        super(TinyBlock, self).__init__()

        # f1: 3x3 depthwise convolution + BatchNorm
        self.f1 = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1, groups=in_channels, bias=False),
            nn.BatchNorm2d(in_channels)
        )

        # f2: 1x1 grouped pointwise convolutions with 8 groups + ReLU
        self.f2 = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=8, bias=False),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        f1_out = self.f1(x)
        f2_out = self.f2(x + f1_out)
        out = x + f1_out + f2_out
        return out

if __name__ == "__main__":
    model = TinyBlock(16, 16)
    print(model)
    dummy_input = torch.randn(256, 16, 8, 8)
    output = model(dummy_input)
    print(output.shape)
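A quick note on TinyBlock's constraints (a sketch, under the same import-path assumption as above): because f2 uses groups=8 and its output is summed with the residual x, the block is shape-preserving and in practice needs in_channels == out_channels with the channel count divisible by 8, as in the 16-channel smoke test above.

    import torch
    from model.tiny_block import TinyBlock  # assumed import path

    block = TinyBlock(32, 32)               # channels must match and be a multiple of 8
    y = block(torch.randn(8, 32, 8, 8))     # (batch, channels, H, W)
    print(y.shape)                          # expected: torch.Size([8, 32, 8, 8])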
model/tinyvad.py
ADDED
@@ -0,0 +1,62 @@
import torch
import torch.nn as nn
from .sinc_conv import TimeSincExtractor, FreqSincExtractor
from .patchify import Patchify
from .csp_tiny_layer import CSPTinyLayer

class TinyVAD(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, patch_size, num_blocks, sinc_conv, ssm):
        super(TinyVAD, self).__init__()

        self.sinc_conv = sinc_conv

        if self.sinc_conv:
            # self.extractor = TimeSincExtractor(out_channels=64, kernel_size=101, range_constraint=True, stride=2)
            self.extractor = FreqSincExtractor(out_channels=64, kernel_size=101, range_constraint=True, stride=2)

        self.patchify = Patchify(in_channels, hidden_channels, patch_size)

        self.csp_tiny_layer1 = CSPTinyLayer(hidden_channels, hidden_channels, num_blocks, ssm)
        self.csp_tiny_layer2 = CSPTinyLayer(hidden_channels, hidden_channels, num_blocks, ssm)
        self.csp_tiny_layer3 = CSPTinyLayer(hidden_channels, out_channels, num_blocks, ssm)

        self.avg_pool = nn.AdaptiveAvgPool2d(1)

        self.classifier = nn.Sequential(
            nn.Linear(out_channels, 1),
            # nn.Sigmoid()
        )

    def forward(self, x):
        if self.sinc_conv:
            x = self.extractor(x, None)
            x = x[0]  # Untuple

        x = self.patchify(x)

        x = self.csp_tiny_layer1(x)
        x = self.csp_tiny_layer2(x)
        x = self.csp_tiny_layer3(x)

        x = self.avg_pool(x).view(x.size(0), -1)

        x = self.classifier(x)

        return x

    def predict(self, inputs):
        logits = self.forward(inputs)
        probs = torch.sigmoid(logits)

        return probs

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model = TinyVAD(1, 32, 64, 2, 2, False, False).to(device)
    print(model)
    dummy_input = torch.randn(1, 1, 64, 16).to(device)
    output = model(dummy_input)
    print(output)
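Finally, a sketch of the raw-waveform path through TinyVAD (assumptions: the repo root is importable as the `model` package, and Patchify maps the (batch, 1, 64, 64) log-energy map to hidden_channels feature maps as its constructor arguments suggest). With sinc_conv=True the FreqSincExtractor front end replaces the precomputed-feature input used in the __main__ block above, and predict() applies a sigmoid to the single output logit.

    import torch
    from model.tinyvad import TinyVAD  # assumed import path

    model = TinyVAD(in_channels=1, hidden_channels=32, out_channels=64,
                    patch_size=2, num_blocks=2, sinc_conv=True, ssm=False).eval()

    waveforms = torch.randn(4, 1, 10080)   # raw audio: (batch, 1, samples), ~0.63 s at 16 kHz
    with torch.no_grad():
        probs = model.predict(waveforms)   # sigmoid over the single logit
    print(probs.shape)                     # expected: torch.Size([4, 1]), values in (0, 1)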