Spaces: Running on Zero

Commit · 4aa0f34
Parent(s): 9577cb2

update to faster inference
Browse files

- app.py +17 -31
- dia/audio.py +27 -104
- dia/config.py +17 -26
- dia/layers.py +106 -337
- dia/model.py +314 -257
- dia/state.py +234 -0
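The gist of the change: the CLI/device plumbing is dropped from app.py, the model is loaded with bfloat16 compute, and generation bookkeeping moves into the new dia/state.py. As a rough sketch of the updated call pattern implied by the app.py hunk below — the positional text argument, the example values, and the [S1]/[S2] prompt format are assumptions; only the keyword names are taken from this commit:

from dia.model import Dia

# bfloat16 compute path used by the updated app.py
model = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="bfloat16")

# Keyword names mirror the updated run_inference() call; the values are placeholders.
audio = model.generate(
    "[S1] Hello there. [S2] Hi!",  # dialogue-style prompt (format assumed)
    cfg_scale=3.0,
    temperature=1.3,
    top_p=0.95,
    cfg_filter_top_k=30,           # replaces the removed use_cfg_filter=True flag
    use_torch_compile=False,
    audio_prompt=None,             # optional path to a voice prompt file
)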
app.py
CHANGED
@@ -1,9 +1,7 @@
-import argparse
 import tempfile
 import time
 from pathlib import Path
 from typing import Optional, Tuple
-import spaces

 import gradio as gr
 import numpy as np
@@ -12,40 +10,17 @@ import torch

 from dia.model import Dia

-# --- Global Setup ---
-parser = argparse.ArgumentParser(description="Gradio interface for Nari TTS")
-parser.add_argument(
-    "--device", type=str, default=None, help="Force device (e.g., 'cuda', 'mps', 'cpu')"
-)
-parser.add_argument("--share", action="store_true", help="Enable Gradio sharing")
-
-args = parser.parse_args()
-
-
-# Determine device
-if args.device:
-    device = torch.device(args.device)
-elif torch.cuda.is_available():
-    device = torch.device("cuda")
-# Simplified MPS check for broader compatibility
-elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-    # Basic check is usually sufficient, detailed check can be problematic
-    device = torch.device("mps")
-else:
-    device = torch.device("cpu")
-
-print(f"Using device: {device}")

 # Load Nari model and config
 print("Loading Nari model...")
 try:
     # Use the function from inference.py
-    model = Dia.from_pretrained("nari-labs/Dia-1.6B")
+    model = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="bfloat16")
 except Exception as e:
     print(f"Error loading Nari model: {e}")
     raise

-
+
 def run_inference(
     text_input: str,
     audio_prompt_input: Optional[Tuple[int, np.ndarray]],
@@ -60,7 +35,7 @@ def run_inference(
     Runs Nari inference using the globally loaded model and provided inputs.
     Uses temporary files for text and audio prompt compatibility with inference.generate.
     """
-
+    global model, device  # Access global model, config, device

     if not text_input or text_input.isspace():
         raise gr.Error("Text input cannot be empty.")
@@ -146,10 +121,9 @@ def run_inference(
             cfg_scale=cfg_scale,
             temperature=temperature,
             top_p=top_p,
-            use_cfg_filter=True,
             cfg_filter_top_k=cfg_filter_top_k,  # Pass the value here
             use_torch_compile=False,  # Keep False for Gradio stability
-
+            audio_prompt=prompt_path_for_generate,
         )

         end_time = time.time()
@@ -192,6 +166,16 @@ def run_inference(
             f"Audio conversion successful. Final shape: {output_audio[1].shape}, Sample Rate: {output_sr}"
         )

+        # Explicitly convert to int16 to prevent Gradio warning
+        if (
+            output_audio[1].dtype == np.float32
+            or output_audio[1].dtype == np.float64
+        ):
+            audio_for_gradio = np.clip(output_audio[1], -1.0, 1.0)
+            audio_for_gradio = (audio_for_gradio * 32767).astype(np.int16)
+            output_audio = (output_sr, audio_for_gradio)
+            print("Converted audio to int16 for Gradio output.")
+
         else:
             print("\nGeneration finished, but no valid tokens were produced.")
             # Return default silence
@@ -383,8 +367,10 @@ with gr.Blocks(css=css) as demo:
     else:
         gr.Markdown("_(No examples configured or example prompt file missing)_")

-
 # --- Launch the App ---
 if __name__ == "__main__":
     print("Launching Gradio interface...")
+
+    # set `GRADIO_SERVER_NAME`, `GRADIO_SERVER_PORT` env vars to override default values
+    # use `GRADIO_SERVER_NAME=0.0.0.0` for Docker
     demo.launch()
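The int16 conversion added to run_inference above is the usual float-to-PCM mapping. A standalone sketch of the same arithmetic (the sample values are made up):

import numpy as np

# Float audio in [-1.0, 1.0] is clipped and scaled to 16-bit PCM,
# which Gradio's Audio output accepts without a dtype warning.
float_audio = np.array([0.0, 0.5, -1.2, 1.0], dtype=np.float32)  # made-up samples
int16_audio = (np.clip(float_audio, -1.0, 1.0) * 32767).astype(np.int16)
print(int16_audio)  # -> [0, 16383, -32767, 32767]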
dia/audio.py
CHANGED
@@ -2,10 +2,10 @@ import typing as tp

 import torch

-from .config import DataConfig

-
-
+def build_delay_indices(
+    B: int, T: int, C: int, delay_pattern: tp.List[int]
+) -> tp.Tuple[torch.Tensor, torch.Tensor]:
     """
     Precompute (t_idx_BxTxC, indices_BTCx3) so that out[t, c] = in[t - delay[c], c].
     Negative t_idx => BOS; t_idx >= T => PAD.
@@ -69,7 +69,9 @@ def apply_audio_delay(

     # Equivalent of tf.gather_nd using advanced indexing
     # Ensure indices are long type if not already (build_delay_indices should handle this)
-    gathered_flat = audio_BxTxC[
+    gathered_flat = audio_BxTxC[
+        indices_BTCx3[:, 0], indices_BTCx3[:, 1], indices_BTCx3[:, 2]
+    ]
     gathered_BxTxC = gathered_flat.view(audio_BxTxC.shape)

     # Create masks on the correct device
@@ -82,65 +84,16 @@ def apply_audio_delay(

     # If mask_bos, BOS; else if mask_pad, PAD; else original gather
     # All tensors should now be on the same device
-    result_BxTxC = torch.where(
-
-    return result_BxTxC
-
-
-@torch.no_grad()
-@torch.inference_mode()
-def audio_to_codebook(
-    model,
-    input_values,
-    data_config: DataConfig,
-    padding_mask=None,
-    sample_rate=44100,
-):
-    """
-    Encodes the input audio waveform into discrete codes.
-
-    Args:
-        model: The model to use for encoding.
-        input_values (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
-            Float values of the input audio waveform.
-        padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
-            Padding mask used to pad the `input_values`.
-        sample_rate (`int`, *optional*) :
-            Signal sampling_rate
-
-    Returns:
-        A list of frames containing the discrete encoded codes for the input audio waveform, along with rescaling
-        factors for each chunk when `normalize` is True. Each frames is a tuple `(codebook, scale)`, with
-        `codebook` of shape `[batch_size, num_codebooks, frames]`.
-        Scale is not used here.
-
-    """
-    audio_data = model.preprocess(input_values, sample_rate)
-
-    if padding_mask is None:
-        padding_mask = torch.ones_like(input_values).bool()
-
-    _, encoded_frame, _, _, _ = model.encode(audio_data, n_quantizers=None)  # 1, C, T
-    seq_length = encoded_frame.shape[2]
-
-    t_idx_BxTxC, indices_BTCx3 = build_delay_indices(
-        B=1,
-        T=seq_length,
-        C=data_config.channels,
-        delay_pattern=data_config.delay_pattern,
-    )
-
-        audio_BxTxC=encoded_frame.transpose(1, 2),  # 1, T, C
-        pad_value=data_config.audio_pad_value,
-        bos_value=data_config.audio_bos_value,
-        precomp=(t_idx_BxTxC, indices_BTCx3),
-    )
-
-    return encoded_frame
-
-
-def build_revert_indices(
+    result_BxTxC = torch.where(
+        mask_bos, bos_tensor, torch.where(mask_pad, pad_tensor, gathered_BxTxC)
+    )
+
+    return result_BxTxC
+
+
+def build_revert_indices(
+    B: int, T: int, C: int, delay_pattern: tp.List[int]
+) -> tp.Tuple[torch.Tensor, torch.Tensor]:
     """
     Precompute indices for the revert operation using PyTorch.

@@ -162,8 +115,12 @@ def build_revert_indices(B: int, T: int, C: int, delay_pattern: tp.List[int]) ->
         t_idx_BT1 + delay_arr.view(1, 1, C),
         torch.tensor(T - 1, device=device),
     )
-    b_idx_BxTxC = torch.broadcast_to(
-
+    b_idx_BxTxC = torch.broadcast_to(
+        torch.arange(B, device=device).view(B, 1, 1), [B, T, C]
+    )
+    c_idx_BxTxC = torch.broadcast_to(
+        torch.arange(C, device=device).view(1, 1, C), [B, T, C]
+    )

     indices_BTCx3 = torch.stack(
         [
@@ -205,15 +162,21 @@ def revert_audio_delay(
     indices_BTCx3 = indices_BTCx3.to(device)

     # Using PyTorch advanced indexing (equivalent to tf.gather_nd or np equivalent)
-    gathered_flat = audio_BxTxC[
-
+    gathered_flat = audio_BxTxC[
+        indices_BTCx3[:, 0], indices_BTCx3[:, 1], indices_BTCx3[:, 2]
+    ]
+    gathered_BxTxC = gathered_flat.view(
+        audio_BxTxC.size()
+    )  # Use .size() for robust reshaping

     # Create pad_tensor on the correct device
     pad_tensor = torch.tensor(pad_value, dtype=audio_BxTxC.dtype, device=device)
     # Create T tensor on the correct device for comparison
     T_tensor = torch.tensor(T, device=device)

-    result_BxTxC = torch.where(
+    result_BxTxC = torch.where(
+        t_idx_BxTxC >= T_tensor, pad_tensor, gathered_BxTxC
+    )  # Changed np.where to torch.where

     return result_BxTxC

@@ -238,43 +201,3 @@ def decode(
     except Exception as e:
         print(f"Error in decode method: {str(e)}")
         raise
-
-
-def codebook_to_audio(generated_codes: torch.Tensor, model, delay_pattern, B=1, T=2600, C=9):
-    """Process a single codebook file to generate audio"""
-    # Remove BOS token
-    generated_codes = generated_codes[:, 1:]
-
-    if generated_codes.shape[1] > T:
-        generated_codes = generated_codes[:, :T]
-
-    seq_length = generated_codes.shape[1]
-
-    # Build revert indices
-    t_idx_BxTxC, indices_BTCx3 = build_revert_indices(B=B, T=seq_length, C=C, delay_pattern=delay_pattern)
-
-    # Transpose and add batch dimension
-    audio_BxTxC = generated_codes.transpose(1, 0).unsqueeze(0)
-    reverted_codebook = revert_audio_delay(
-        audio_BxTxC=audio_BxTxC,
-        pad_value=0,
-        precomp=(t_idx_BxTxC, indices_BTCx3),
-        T=seq_length,
-    )
-    reverted_codebook = reverted_codebook[:, :-30, :]
-
-    codebook = reverted_codebook.transpose(1, 2)
-
-    min_valid_index = 0
-    max_valid_index = 1023
-    invalid_mask = (codebook < min_valid_index) | (codebook > max_valid_index)
-
-    num_invalid = torch.sum(invalid_mask).item()
-    if num_invalid > 0:
-        print(f"Warning: Clamping {num_invalid} indices outside range [{min_valid_index}, {max_valid_index}] to 0.")
-
-    # Set invalid values to 0 (modify the tensor in-place)
-    codebook[invalid_mask] = 0
-    audio_array = decode(model, codebook)
-
-    return audio_array
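The delay helpers above implement the relation from their docstring, out[t, c] = in[t - delay[c], c], with BOS before the start of the sequence and PAD past its end. A tiny loop-based illustration of that rule (toy values; the real code precomputes flat gather indices instead of looping):

import torch

T, C = 6, 3
delay = [0, 1, 2]          # per-channel delays (toy values)
BOS, PAD = 1026, 1025      # sentinels matching the DataConfig defaults
codes = torch.arange(T * C).reshape(T, C)  # fake codebook frames

delayed = torch.empty_like(codes)
for t in range(T):
    for c in range(C):
        src = t - delay[c]
        # Negative source index -> BOS; beyond the end -> PAD; otherwise shift.
        delayed[t, c] = BOS if src < 0 else (PAD if src >= T else codes[src, c])

print(delayed[:3])  # channel c lags the original frames by delay[c] steps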
dia/config.py
CHANGED
@@ -33,14 +33,20 @@ class DataConfig(BaseModel, frozen=True):
         delay_pattern: List of delay values for each audio channel.
     """

-    text_length: Annotated[int, BeforeValidator(lambda x: (x + 127) // 128 * 128)] =
-
+    text_length: Annotated[int, BeforeValidator(lambda x: (x + 127) // 128 * 128)] = (
+        Field(gt=0, multiple_of=128)
+    )
+    audio_length: Annotated[int, BeforeValidator(lambda x: (x + 127) // 128 * 128)] = (
+        Field(gt=0, multiple_of=128)
+    )
     channels: int = Field(default=9, gt=0, multiple_of=1)
     text_pad_value: int = Field(default=0)
     audio_eos_value: int = Field(default=1024)
     audio_pad_value: int = Field(default=1025)
     audio_bos_value: int = Field(default=1026)
-    delay_pattern: list[Annotated[int, Field(ge=0)]] = Field(
+    delay_pattern: list[Annotated[int, Field(ge=0)]] = Field(
+        default_factory=lambda: [0, 8, 9, 10, 11, 12, 13, 14, 15]
+    )

     def __hash__(self) -> int:
         """Generate a hash based on all fields of the config."""
@@ -67,8 +73,6 @@ class EncoderConfig(BaseModel, frozen=True):
         n_hidden: Hidden dimension size in the MLP layers.
         n_head: Number of attention heads.
         head_dim: Dimension per attention head.
-        mlp_activations: List of activation functions for the MLP layers.
-        use_pre_norm: Whether to use pre-normalization (LayerNorm before attention/MLP).
     """

     n_layer: int = Field(gt=0)
@@ -76,8 +80,6 @@ class EncoderConfig(BaseModel, frozen=True):
     n_hidden: int = Field(gt=0)
     n_head: int = Field(gt=0)
     head_dim: int = Field(gt=0)
-    mlp_activations: list[str] = Field(default=["silu", "linear"])
-    use_pre_norm: bool = Field(default=False)


 class DecoderConfig(BaseModel, frozen=True):
@@ -92,8 +94,6 @@ class DecoderConfig(BaseModel, frozen=True):
         gqa_head_dim: Dimension per query head for grouped-query self-attention.
         cross_query_heads: Number of query heads for cross-attention.
         cross_head_dim: Dimension per cross-attention head.
-        mlp_activations: List of activation functions for the MLP layers.
-        use_pre_norm: Whether to use pre-normalization.
     """

     n_layer: int = Field(gt=0)
@@ -104,8 +104,6 @@ class DecoderConfig(BaseModel, frozen=True):
     gqa_head_dim: int = Field(gt=0)
     cross_query_heads: int = Field(gt=0)
     cross_head_dim: int = Field(gt=0)
-    mlp_activations: list[str] = Field(default=["silu", "linear"])
-    use_pre_norm: bool = Field(default=False)


 class ModelConfig(BaseModel, frozen=True):
@@ -130,24 +128,16 @@ class ModelConfig(BaseModel, frozen=True):
     dropout: float = Field(default=0.0, ge=0.0, lt=1.0)
     normalization_layer_epsilon: float = Field(default=1.0e-5, ge=0.0)
     weight_dtype: str = Field(default="float32", description="Weight precision")
-    rope_min_timescale: int = Field(
-
+    rope_min_timescale: int = Field(
+        default=1, description="Timescale For global Attention"
+    )
+    rope_max_timescale: int = Field(
+        default=10_000, description="Timescale For global Attention"
+    )


 class TrainingConfig(BaseModel, frozen=True):
-
-
-    Note: This configuration currently only includes precision settings.
-    Other training parameters (like batch size, learning rate, optimizer settings)
-    are assumed to be handled externally.
-
-    Attributes:
-        dtype: Data type for activations during training (e.g., "bfloat16", "float32").
-        logits_dot_in_fp32: Whether to compute the final logits dot product in fp32 for stability.
-    """
-
-    dtype: str = Field(default="bfloat16", description="Activation precision")
-    logits_dot_in_fp32: bool = Field(default=False)
+    pass


 class DiaConfig(BaseModel, frozen=True):
@@ -164,6 +154,7 @@ class DiaConfig(BaseModel, frozen=True):

     version: str = Field(default="1.0")
     model: ModelConfig
+    # TODO: remove training. this is just for backwards-compatability
     training: TrainingConfig
     data: DataConfig
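The BeforeValidator on text_length and audio_length rounds any requested length up to the next multiple of 128 before the multiple_of=128 constraint is checked. A quick illustration of that arithmetic (example values chosen arbitrarily):

def round_up_to_128(x: int) -> int:
    # Same expression as the validator: (x + 127) // 128 * 128
    return (x + 127) // 128 * 128

print(round_up_to_128(1000))  # 1024
print(round_up_to_128(1024))  # 1024 (already a multiple, unchanged)
print(round_up_to_128(3072))  # 3072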
dia/layers.py
CHANGED
@@ -1,5 +1,3 @@
-from typing import Any
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -7,26 +5,13 @@ from torch import Tensor
 from torch.nn import RMSNorm

 from .config import DiaConfig
+from .state import DecoderInferenceState, EncoderInferenceState, KVCache


 def _normalize_axes(axes: tuple[int, ...], ndim: int) -> tuple[int, ...]:
     return tuple(ax if ax >= 0 else ndim + ax for ax in axes)


-def _str_to_dtype(dtype_str: str) -> torch.dtype | None:
-    # Allow None for default behavior
-    if dtype_str is None or dtype_str.lower() == "none":
-        return None
-    if dtype_str == "float32":
-        return torch.float32
-    elif dtype_str == "float16":
-        return torch.float16
-    elif dtype_str == "bfloat16":
-        return torch.bfloat16
-    else:
-        raise ValueError(f"Unsupported dtype string: {dtype_str}")
-
-
 class DenseGeneral(nn.Module):
     """
     PyTorch equivalent of flax.linen.DenseGeneral with shapes defined at init.
@@ -50,7 +35,6 @@ class DenseGeneral(nn.Module):
         in_shapes: tuple[int, ...],
         out_features: tuple[int, ...],
         axis: tuple[int, ...] = (-1,),
-        dtype: torch.dtype | None = None,
         weight_dtype: torch.dtype | None = None,
         device: torch.device | None = None,
     ):
@@ -58,7 +42,6 @@ class DenseGeneral(nn.Module):
         self.in_shapes = in_shapes
         self.out_features = out_features
         self.axis = axis
-        self.dtype = dtype
         self.kernel_shape = self.in_shapes + self.out_features

         factory_kwargs = {"device": device, "dtype": weight_dtype}
@@ -70,95 +53,44 @@ class DenseGeneral(nn.Module):
         kernel_contract_axes = tuple(range(len(norm_axis)))

         output = torch.tensordot(
-            inputs.
-            self.weight
+            inputs.to(self.weight.dtype),
+            self.weight,
             dims=(norm_axis, kernel_contract_axes),
         ).to(inputs.dtype)
         return output


-def get_activation_fn(activation_string: str) -> nn.Module:  # Return Module instance
-    """Maps activation string to PyTorch activation function module."""
-    if activation_string == "gelu":
-        return nn.GELU()
-    elif activation_string == "relu":
-        return nn.ReLU()
-    elif activation_string == "silu" or activation_string == "swish":
-        return nn.SiLU()
-    elif activation_string == "linear":
-        return nn.Identity()
-    else:
-        raise ValueError(f"Unsupported activation function: {activation_string}")
-
-
 class MlpBlock(nn.Module):
     """MLP block using DenseGeneral."""

     def __init__(
-        self,
-        config: DiaConfig,
-        embed_dim: int,
-        intermediate_dim: int,
-        dropout_rate: float,
-        activations: list[str] = ["silu", "linear"],
-        use_pre_norm: bool = False,
+        self, embed_dim: int, intermediate_dim: int, compute_dtype: torch.dtype
     ):
         super().__init__()
-        self.use_pre_norm = use_pre_norm
-        num_activations = len(activations)
-        compute_dtype = _str_to_dtype(config.training.dtype)
-        weight_dtype = _str_to_dtype(config.model.weight_dtype)
         self.dtype = compute_dtype
-        # Assume default device for now, could be passed in config
-
-        if use_pre_norm:
-            self.pre_norm = RMSNorm(
-                embed_dim,
-                eps=config.model.normalization_layer_epsilon,
-                dtype=torch.float32,
-            )

         self.wi_fused = DenseGeneral(
             in_shapes=(embed_dim,),
-            out_features=(
-                num_activations,
-                intermediate_dim,
-            ),
+            out_features=(2, intermediate_dim),
             axis=(-1,),
-
-            weight_dtype=weight_dtype,
+            weight_dtype=compute_dtype,
         )

-        self.activation_fn_0 = get_activation_fn(activations[0])  # silu
-        self.activation_fn_1 = get_activation_fn(activations[1])  # linear
-
-        self.dropout = nn.Dropout(dropout_rate)
-
-        # Output layer using DenseGeneral
         self.wo = DenseGeneral(
             in_shapes=(intermediate_dim,),
             out_features=(embed_dim,),
             axis=(-1,),
-
-            weight_dtype=weight_dtype,
+            weight_dtype=compute_dtype,
         )

-    def forward(self, x: torch.Tensor
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Forward pass."""
-        if self.use_pre_norm and hasattr(self, "pre_norm"):
-            x = self.pre_norm(x)
-
         fused_x = self.wi_fused(x)

-
-
-
-        gate = self.activation_fn_0(gate_input)
-        up = self.activation_fn_1(up_input)
-        hidden = torch.mul(gate, up).to(self.dtype)
-
-
-        hidden = self.dropout(hidden)
+        gate = fused_x[..., 0, :]
+        up = fused_x[..., 1, :]
+
+        hidden = torch.mul(F.silu(gate), up).to(self.dtype)

         output = self.wo(hidden)
         return output
@@ -207,37 +139,6 @@ class RotaryEmbedding(nn.Module):
         return torch.cat((first_part, second_part), dim=-1)


-class KVCache:
-    def __init__(self, num_heads, max_len, head_dim, device, k=None, v=None):
-        self.k = torch.zeros((2, num_heads, max_len, head_dim), device=device) if k is None else k
-        self.v = torch.zeros((2, num_heads, max_len, head_dim), device=device) if v is None else v
-        self.current_idx = 0
-        self.max_len = max_len
-
-    def get_kv_for_attention(self, current_k, current_v):
-        if self.current_idx == 0:
-            return current_k, current_v
-        else:
-            past_k = self.k[:, :, : self.current_idx, :]
-            past_v = self.v[:, :, : self.current_idx, :]
-            attn_k = torch.cat((past_k, current_k), dim=2)
-            attn_v = torch.cat((past_v, current_v), dim=2)
-            return attn_k, attn_v
-
-    def update_cache(self, k, v):
-        assert self.current_idx < self.max_len
-        self.k[:, :, self.current_idx : self.current_idx + 1, :] = k
-        self.v[:, :, self.current_idx : self.current_idx + 1, :] = v
-        self.current_idx += 1
-
-    def prefill_kv(self, k, v):
-        prefill_len = k.shape[2]
-        assert prefill_len <= self.max_len
-        self.k[:, :, :prefill_len, :] = k
-        self.v[:, :, :prefill_len, :] = v
-        self.current_idx = prefill_len
-
-
 class Attention(nn.Module):
     """Attention using DenseGeneral."""

@@ -249,7 +150,7 @@ class Attention(nn.Module):
         num_query_heads: int,
         num_kv_heads: int,
         head_dim: int,
-
+        compute_dtype: torch.dtype,
         is_cross_attn: bool = False,
         out_embed_dim: int | None = None,
     ):
@@ -258,13 +159,12 @@ class Attention(nn.Module):
         self.num_kv_heads = num_kv_heads
         self.head_dim = head_dim
         self.is_cross_attn = is_cross_attn
-        self.dropout_rate = dropout_rate
-        compute_dtype = _str_to_dtype(config.training.dtype)
-        weight_dtype = _str_to_dtype(config.model.weight_dtype)
         self.output_dim = out_embed_dim if out_embed_dim is not None else q_embed_dim
         self.projected_query_dim = num_query_heads * head_dim
         if num_query_heads % num_kv_heads != 0:
-            raise ValueError(
+            raise ValueError(
+                f"num_query_heads ({num_query_heads}) must be divisible by num_kv_heads ({num_kv_heads})"
+            )
         self.num_gqa_groups = num_query_heads // num_kv_heads

         # --- Projection Layers using DenseGeneral ---
@@ -272,29 +172,25 @@ class Attention(nn.Module):
             in_shapes=(q_embed_dim,),
             out_features=(num_query_heads, head_dim),
             axis=(-1,),
-
-            weight_dtype=weight_dtype,
+            weight_dtype=compute_dtype,
         )
         self.k_proj = DenseGeneral(
             in_shapes=(kv_embed_dim,),
             out_features=(num_kv_heads, head_dim),
             axis=(-1,),
-
-            weight_dtype=weight_dtype,
+            weight_dtype=compute_dtype,
         )
         self.v_proj = DenseGeneral(
             in_shapes=(kv_embed_dim,),
             out_features=(num_kv_heads, head_dim),
             axis=(-1,),
-
-            weight_dtype=weight_dtype,
+            weight_dtype=compute_dtype,
         )
         self.o_proj = DenseGeneral(
             in_shapes=(num_query_heads, head_dim),
             out_features=(self.output_dim,),
             axis=(-2, -1),
-
-            weight_dtype=weight_dtype,
+            weight_dtype=compute_dtype,
         )

         # --- Rotary Embedding ---
@@ -311,10 +207,11 @@ class Attention(nn.Module):
         Xkv: torch.Tensor,  # (B, S, E) S = 1 in AR generation
         q_positions: torch.Tensor,  # (B, T)
         kv_positions: torch.Tensor | None = None,  # (B, S)
-
-
+        attn_mask: torch.Tensor
+        | None = None,  # None in Decoder Self Attention, Valid mask in Others
         cache: KVCache | None = None,  # None in Encoder, KVCache in Decoder
-        prefill: bool = False,
+        prefill: bool = False,
+        is_causal: bool = False,
     ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
         """
         Performs attention calculation with optional KV caching.
@@ -324,7 +221,6 @@ class Attention(nn.Module):
         Xkv: Key/Value source tensor (B, S, E). S=1 during single-step decoding for self-attn.
         q_positions: Positions for queries (B, T).
         kv_positions: Positions for keys/values (B, S). If None, uses q_positions.
-        deterministic: If True, disable dropout.
         attn_mask: Attention mask.
         cache: KVCache.
         prefill: If True, use prefill mode.
@@ -342,72 +238,51 @@ class Attention(nn.Module):
         Xq_BxTxNxH = self.rotary_emb(Xq_BxTxNxH, position=q_positions)
         Xq_BxNxTxH = Xq_BxTxNxH.transpose(1, 2)

-        # Input values into attention calculation
         attn_k: torch.Tensor | None = None
         attn_v: torch.Tensor | None = None
-        new_kv_cache: tuple[torch.Tensor, torch.Tensor] | None = None

-        # Decoder Cross Attention
         if self.is_cross_attn:
-            # Directly use cache (no need to check index)
             attn_k, attn_v = cache.k, cache.v
-            if attn_k.shape[1] != self.num_query_heads or attn_v.shape[1] != self.num_query_heads:
-                raise ValueError(
-                    f"Cross-attention cache head dimension ({attn_k.shape[1]}) "
-                    f"does not match num_query_heads ({self.num_query_heads}). "
-                    "Cache should be pre-repeated for GQA."
-                )
-        # Self Attention
         else:
             Xk_BxSxKxH = self.k_proj(Xkv)  # (B, S, K, H)
             Xv_BxSxKxH = self.v_proj(Xkv)  # (B, S, K, H)
-            Xk_BxSxKxH = self.rotary_emb(
+            Xk_BxSxKxH = self.rotary_emb(
+                Xk_BxSxKxH, position=kv_positions
+            )  # (B, S, K, H)

             Xk_BxKxSxH = Xk_BxSxKxH.transpose(1, 2)  # (B, K, S, H)
             Xv_BxKxSxH = Xv_BxSxKxH.transpose(1, 2)  # (B, K, S, H)
-            # S=1 for Decode Step
-
-            if self.num_gqa_groups > 1:
-                Xk_BxNxSxH = Xk_BxKxSxH.repeat_interleave(self.num_gqa_groups, dim=1)
-                Xv_BxNxSxH = Xv_BxKxSxH.repeat_interleave(self.num_gqa_groups, dim=1)
-            else:
-                Xk_BxNxSxH = Xk_BxKxSxH
-                Xv_BxNxSxH = Xv_BxKxSxH

-            # Encoder Self Attention
             if cache is None:
-                attn_k =
-                attn_v =
-            # Decoder Self Attention
+                attn_k = Xk_BxKxSxH
+                attn_v = Xv_BxKxSxH
             else:
-                # In prefill mode, we fill in cache until prefill length
                 if prefill:
-                    attn_k, attn_v =
-                    cache.
-                # In decode step, we add current K/V to cache step by step
+                    attn_k, attn_v = Xk_BxKxSxH, Xv_BxKxSxH
+                    cache.prefill(attn_k, attn_v)
                 else:
-
-                    attn_k, attn_v = cache.get_kv_for_attention(Xk_BxNxSxH, Xv_BxNxSxH)
+                    attn_k, attn_v = cache.update(Xk_BxKxSxH, Xv_BxKxSxH)

         attn_output = F.scaled_dot_product_attention(
             Xq_BxNxTxH,
             attn_k,
             attn_v,
             attn_mask=attn_mask,
-            dropout_p=self.dropout_rate if not deterministic else 0.0,
             scale=1.0,
+            enable_gqa=self.num_gqa_groups > 1,
+            is_causal=is_causal,
         )

         attn_output = attn_output.transpose(1, 2).contiguous()  # (B, T, N, H)
         output = self.o_proj(attn_output)

-        return output.to(original_dtype)
+        return output.to(original_dtype)


 class EncoderLayer(nn.Module):
     """Transformer Encoder Layer using DenseGeneral."""

-    def __init__(self, config: DiaConfig):
+    def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
         super().__init__()
         self.config = config
         model_config = config.model
@@ -420,13 +295,13 @@ class EncoderLayer(nn.Module):
             dtype=torch.float32,
         )
         self.self_attention = Attention(
-            config
+            config,
             q_embed_dim=embed_dim,
             kv_embed_dim=embed_dim,
             num_query_heads=enc_config.n_head,
             num_kv_heads=enc_config.n_head,
             head_dim=enc_config.head_dim,
-
+            compute_dtype=compute_dtype,
             is_cross_attn=False,
             out_embed_dim=embed_dim,
         )
@@ -436,62 +311,52 @@ class EncoderLayer(nn.Module):
             dtype=torch.float32,
         )
         self.mlp = MlpBlock(
-            config=config,
             embed_dim=embed_dim,
             intermediate_dim=enc_config.n_hidden,
-
-            dropout_rate=model_config.dropout,
-            use_pre_norm=enc_config.use_pre_norm,
+            compute_dtype=compute_dtype,
         )
-        self.dropout = nn.Dropout(model_config.dropout)

     def forward(
         self,
         x: torch.Tensor,
-
-        deterministic: bool = True,
-        attn_mask: torch.Tensor | None = None,
+        state: EncoderInferenceState,
     ) -> torch.Tensor:
         residual = x
         x_norm = self.pre_sa_norm(x)
-
-        sa_out, _ = self.self_attention(
+        sa_out = self.self_attention(
             Xq=x_norm,
             Xkv=x_norm,
-            q_positions=
-            kv_positions=
-
-            attn_mask=attn_mask,
+            q_positions=state.positions,
+            kv_positions=state.positions,
+            attn_mask=state.attn_mask,
         )
         x = residual + sa_out

         residual = x
         x_norm = self.post_sa_norm(x)
-        mlp_out = self.mlp(x_norm
+        mlp_out = self.mlp(x_norm)
         x = residual + mlp_out

-        if not deterministic:
-            x = self.dropout(x)
         return x


 class Encoder(nn.Module):
     """Transformer Encoder Stack using DenseGeneral."""

-    def __init__(self, config: DiaConfig):
+    def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
         super().__init__()
         self.config = config
         model_config = config.model
         enc_config = config.model.encoder
-        compute_dtype = _str_to_dtype(config.training.dtype)

         self.embedding = nn.Embedding(
             model_config.src_vocab_size,
             enc_config.n_embd,
             dtype=compute_dtype,
         )
-        self.
-
+        self.layers = nn.ModuleList(
+            [EncoderLayer(config, compute_dtype) for _ in range(enc_config.n_layer)]
+        )
         self.norm = RMSNorm(
             enc_config.n_embd,
             eps=model_config.normalization_layer_epsilon,
@@ -501,32 +366,21 @@ class Encoder(nn.Module):
     def forward(
         self,
         x_ids: torch.Tensor,
-
-        deterministic: bool = True,
-        attn_mask: torch.Tensor | None = None,
+        state: EncoderInferenceState,
     ) -> torch.Tensor:
         x = self.embedding(x_ids)

-        if not deterministic:
-            x = self.dropout(x)
-
         for layer in self.layers:
-            x = layer(
-
-                src_positions=src_positions,
-                deterministic=deterministic,
-                attn_mask=attn_mask,
-            )
+            x = layer(x, state)
+
         x = self.norm(x)
-        if not deterministic:
-            x = self.dropout(x)
         return x


 class DecoderLayer(nn.Module):
     """Transformer Decoder Layer using DenseGeneral."""

-    def __init__(self, config: DiaConfig):
+    def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
         super().__init__()
         self.config = config
         model_config = config.model
@@ -554,13 +408,13 @@ class DecoderLayer(nn.Module):

         # Self-Attention (GQA) with Causal Masking
         self.self_attention = Attention(
-            config
+            config,
             q_embed_dim=dec_embed_dim,
             kv_embed_dim=dec_embed_dim,
             num_query_heads=dec_config.gqa_query_heads,
             num_kv_heads=dec_config.kv_heads,
             head_dim=dec_config.gqa_head_dim,
-
+            compute_dtype=compute_dtype,
             is_cross_attn=False,
             out_embed_dim=dec_embed_dim,
         )
@@ -572,116 +426,105 @@
             num_query_heads=dec_config.cross_query_heads,
             num_kv_heads=dec_config.cross_query_heads,
             head_dim=dec_config.cross_head_dim,
-
+            compute_dtype=compute_dtype,
             is_cross_attn=True,
             out_embed_dim=dec_embed_dim,
         )
         # MLP
         self.mlp = MlpBlock(
-            config=config,
             embed_dim=dec_embed_dim,
             intermediate_dim=dec_config.n_hidden,
-
-            dropout_rate=model_config.dropout,
-            use_pre_norm=dec_config.use_pre_norm,
+            compute_dtype=compute_dtype,
         )

     def forward(
         self,
         x: torch.Tensor,
-
-
-
-        deterministic: bool,
-        self_attn_mask: torch.Tensor,
-        cross_attn_mask: torch.Tensor,
-        self_attn_cache: KVCache,
-        cross_attn_cache: KVCache,
+        state: DecoderInferenceState,
+        self_attn_cache: KVCache | None = None,
+        cross_attn_cache: KVCache | None = None,
         prefill: bool = False,
     ) -> torch.Tensor:
         residual = x
         x_norm = self.pre_sa_norm(x)

-        sa_out
+        sa_out = self.self_attention(
             Xq=x_norm,  # (2, 1, D)
             Xkv=x_norm,  # (2, 1, D)
-            q_positions=
-            kv_positions=
-
-            attn_mask=self_attn_mask,  # (2, 1, 1, S_max)
+            q_positions=state.dec_positions,  # (2, 1)
+            kv_positions=state.dec_positions,  # (2, 1)
+            attn_mask=None,
             cache=self_attn_cache,
             prefill=prefill,
+            is_causal=prefill,
         )

         x = residual + sa_out

-        # 2. Cross-Attention
         residual = x
         x_norm = self.pre_ca_norm(x)
-        ca_out
+        ca_out = self.cross_attention(
             Xq=x_norm,
-            Xkv=
-            q_positions=
-            kv_positions=
-
-            attn_mask=cross_attn_mask,
+            Xkv=state.enc_out,
+            q_positions=state.dec_positions,
+            kv_positions=state.enc_positions,
+            attn_mask=state.dec_cross_attn_mask,
             cache=cross_attn_cache,
         )
         x = residual + ca_out

-        # 3. MLP
         residual = x
         x_norm = self.pre_mlp_norm(x)
-        mlp_out = self.mlp(x_norm
+        mlp_out = self.mlp(x_norm)
         x = residual + mlp_out

-        return x
+        return x


 class Decoder(nn.Module):
     """Transformer Decoder Stack using DenseGeneral."""

-    def __init__(self, config: DiaConfig):
+    def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
         super().__init__()
         self.config = config
         model_config = config.model
         dec_config = config.model.decoder
-        train_config = config.training
         data_config = config.data
-        compute_dtype = _str_to_dtype(config.training.dtype)
-        weight_dtype = _str_to_dtype(config.model.weight_dtype)
         self.num_channels = data_config.channels
         self.num_layers = dec_config.n_layer

         self.embeddings = nn.ModuleList(
             [
-                nn.Embedding(
+                nn.Embedding(
+                    model_config.tgt_vocab_size, dec_config.n_embd, dtype=compute_dtype
+                )
                 for _ in range(self.num_channels)
             ]
         )
-        self.
-
+        self.layers = nn.ModuleList(
+            [
+                DecoderLayer(config=config, compute_dtype=compute_dtype)
+                for _ in range(self.num_layers)
+            ]
+        )
+
         self.norm = RMSNorm(
             dec_config.n_embd,
             eps=model_config.normalization_layer_epsilon,
             dtype=torch.float32,
         )

-        # Final Logits Projection using DenseGeneral
         self.logits_dense = DenseGeneral(
             in_shapes=(dec_config.n_embd,),
             out_features=(self.num_channels, model_config.tgt_vocab_size),
             axis=(-1,),
-
-            weight_dtype=weight_dtype,
+            weight_dtype=compute_dtype,
         )
-        self.logits_in_fp32 = train_config.logits_dot_in_fp32

-    def
+    def precompute_cross_attn_cache(
         self,
-
-
-        src_positions: torch.Tensor | None,  # (B, S)
+        enc_out: torch.Tensor,  # (B, S, E)
+        enc_positions: torch.Tensor,  # (B, S)
     ) -> list[KVCache]:
         """
         Computes the Key and Value tensors for cross-attention for each layer from the encoder output.
@@ -690,35 +533,21 @@ class Decoder(nn.Module):

         for layer in self.layers:
             cross_attn_module = layer.cross_attention
-            k_proj = cross_attn_module.k_proj(
-            v_proj = cross_attn_module.v_proj(
+            k_proj = cross_attn_module.k_proj(enc_out)
+            v_proj = cross_attn_module.v_proj(enc_out)

-            k_proj = cross_attn_module.rotary_emb(k_proj, position=
+            k_proj = cross_attn_module.rotary_emb(k_proj, position=enc_positions)
             k = k_proj.transpose(1, 2)
             v = v_proj.transpose(1, 2)

-            per_layer_kv_cache.append(
-                KVCache(
-                    cross_attn_module.num_kv_heads,
-                    max_len,
-                    cross_attn_module.head_dim,
-                    k.device,
-                    k=k,
-                    v=v,
-                )
-            )
+            per_layer_kv_cache.append(KVCache.from_kv(k, v))

         return per_layer_kv_cache

     def decode_step(
         self,
         tgt_ids_Bx1xC: torch.Tensor,  # [B, 1, C]
-
-        encoder_out: torch.Tensor,  # [B, S, E]
-        self_attn_mask: Any,  # None
-        cross_attn_mask: torch.Tensor,  # [B, 1, 1, S]
-        self_attention_cache: list[KVCache],
-        cross_attention_cache: list[KVCache],
+        state: DecoderInferenceState,
     ) -> torch.Tensor:
         """
         Performs a single decoding step, managing KV caches layer by layer.
@@ -727,7 +556,6 @@ class Decoder(nn.Module):
         A tuple containing:
         - logits_Bx1xCV: The final output logits for the current step (B, 1, C*V), cast to float32.
         """
-        assert self_attn_mask is None, "Self-attention mask should be None, kept for pattern"

         x = None
         for i in range(self.num_channels):
@@ -735,40 +563,23 @@ class Decoder(nn.Module):
             channel_embed = self.embeddings[i](channel_tokens)
             x = channel_embed if x is None else x + channel_embed

-        new_cache = []
-
         for i, layer in enumerate(self.layers):
-            self_cache =
-            cross_cache =
-            x
+            self_cache = state.self_attn_cache[i]
+            cross_cache = state.cross_attn_cache[i]
+            x = layer(
                 x,  # (2, 1, D)
-
-                src_positions=None,  # CA KV is already computed
-                tgt_positions=tgt_pos_Bx1,  # (2, 1)
-                deterministic=True,
-                self_attn_mask=None,
-                cross_attn_mask=cross_attn_mask,
+                state,
                 self_attn_cache=self_cache,
                 cross_attn_cache=cross_cache,
             )
-            new_cache.append(new_kv_cache)

         x = self.norm(x)
         logits_Bx1xCxV = self.logits_dense(x)

-        return logits_Bx1xCxV.to(torch.float32)
+        return logits_Bx1xCxV.to(torch.float32)

     def forward(
-        self,
-        tgt_ids_BxTxC: torch.Tensor,
-        encoder_out: torch.Tensor,
-        tgt_positions: torch.Tensor,
-        src_positions: torch.Tensor,
-        deterministic: bool,
-        self_attn_mask: torch.Tensor,
-        cross_attn_mask: torch.Tensor,
-        self_attention_cache: list[KVCache],
-        cross_attention_cache: list[KVCache],
+        self, tgt_ids_BxTxC: torch.Tensor, state: DecoderInferenceState
     ) -> torch.Tensor:
         """
         Forward pass for the Decoder stack, managing KV caches.
@@ -778,7 +589,6 @@ class Decoder(nn.Module):
         encoder_out: Output from the encoder (B, S, E).
         tgt_positions: Positions for target sequence (B, T).
         src_positions: Positions for source sequence (B, S).
-        deterministic: Disable dropout if True.
         self_attn_mask: Mask for self-attention.
         cross_attn_mask: Mask for cross-attention.
         past_key_values: List containing the self-attention KV cache for each layer
@@ -804,20 +614,14 @@ class Decoder(nn.Module):
             channel_embed = self.embeddings[i](channel_tokens)
             x = channel_embed if x is None else x + channel_embed

-        if not deterministic:
-            x = self.dropout(x)
-
         for i, layer in enumerate(self.layers):
-
                 x,
-
-
-
-                deterministic=deterministic,
-                self_attn_mask=self_attn_mask,
-                cross_attn_mask=cross_attn_mask,
-                self_attn_cache=self_attention_cache[i],
-                cross_attn_cache=cross_attention_cache[i],
                 prefill=True,
             )

@@ -831,43 +635,8 @@ class Decoder(nn.Module):
 class DiaModel(nn.Module):
     """PyTorch Dia Model using DenseGeneral."""

-    def __init__(self, config: DiaConfig):
         super().__init__()
         self.config = config
-        self.encoder = Encoder(config)
-        self.decoder = Decoder(config)
-
-    def forward(
-        self,
-        src_BxS: torch.Tensor,
-        tgt_BxTxC: torch.Tensor,
-        src_positions: torch.Tensor | None = None,
-        tgt_positions: torch.Tensor | None = None,
-        enc_self_attn_mask: torch.Tensor | None = None,
-        dec_self_attn_mask: torch.Tensor | None = None,
-        dec_cross_attn_mask: torch.Tensor | None = None,
-        enable_dropout: bool = True,
-    ):
-        deterministic = not enable_dropout
-
-        # --- Encoder Pass ---
-        encoder_out = self.encoder(
-            x_ids=src_BxS,
-            src_positions=src_positions,
-            deterministic=deterministic,
-            attn_mask=enc_self_attn_mask,
-        )
-
-        # --- Decoder Pass ---
-        logits, _ = self.decoder(
-            tgt_ids_BxTxC=tgt_BxTxC,
-            encoder_out=encoder_out,
-            tgt_positions=tgt_positions,
-            src_positions=src_positions,
-            deterministic=deterministic,
-            self_attn_mask=dec_self_attn_mask,
-            cross_attn_mask=dec_cross_attn_mask,
-            precomputed_cross_attn_kv=None,
-        )
-
-        return logits
|
|
|
614 |
channel_embed = self.embeddings[i](channel_tokens)
|
615 |
x = channel_embed if x is None else x + channel_embed
|
616 |
|
|
|
|
|
|
|
617 |
for i, layer in enumerate(self.layers):
|
618 |
+
self_cache = state.self_attn_cache[i]
|
619 |
+
cross_cache = state.cross_attn_cache[i]
|
620 |
+
x = layer(
|
621 |
x,
|
622 |
+
state,
|
623 |
+
self_attn_cache=self_cache,
|
624 |
+
cross_attn_cache=cross_cache,
|
|
|
|
|
|
|
|
|
|
|
625 |
prefill=True,
|
626 |
)
|
627 |
|
|
|
635 |
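In `Decoder.decode_step` above, each of the audio codebook channels has its own embedding table and the per-channel embeddings are summed before the transformer stack. A minimal, self-contained sketch of that summation, with hypothetical sizes chosen only for illustration:

import torch
import torch.nn as nn

# Hypothetical sizes for illustration only: 9 codebook channels, vocab 1028, width 2048.
num_channels, vocab_size, embed_dim = 9, 1028, 2048
embeddings = nn.ModuleList(
    [nn.Embedding(vocab_size, embed_dim) for _ in range(num_channels)]
)

tgt_ids_Bx1xC = torch.randint(0, vocab_size, (2, 1, num_channels))  # one step, CFG batch of 2

x = None
for i in range(num_channels):
    channel_tokens = tgt_ids_Bx1xC[..., i]           # (B, 1)
    channel_embed = embeddings[i](channel_tokens)    # (B, 1, D)
    x = channel_embed if x is None else x + channel_embed  # summed channel embeddings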
class DiaModel(nn.Module):
    """PyTorch Dia Model using DenseGeneral."""

+   def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
        super().__init__()
        self.config = config
+       self.encoder = Encoder(config, compute_dtype)
+       self.decoder = Decoder(config, compute_dtype)
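`Decoder.precompute_cross_attn_cache` above exploits the fact that the encoder output never changes while tokens are being decoded, so the cross-attention keys and values can be projected once per generation and reused at every step. A self-contained sketch of the idea, with toy projection sizes that are not the model's real dimensions:

import torch
import torch.nn as nn

embed_dim, num_heads, head_dim, seq_len = 1024, 16, 64, 128  # toy sizes
k_proj = nn.Linear(embed_dim, num_heads * head_dim, bias=False)
v_proj = nn.Linear(embed_dim, num_heads * head_dim, bias=False)

enc_out = torch.randn(2, seq_len, embed_dim)  # fixed encoder output (CFG batch of 2)

# Project once, reshape to (B, H, S, Dh), and keep the result for every decode step.
k = k_proj(enc_out).view(2, seq_len, num_heads, head_dim).transpose(1, 2)
v = v_proj(enc_out).view(2, seq_len, num_heads, head_dim).transpose(1, 2)
cross_kv = (k, v)  # reused by each layer's cross-attention at every decoding step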
dia/model.py
CHANGED
@@ -1,26 +1,46 @@
import dac
import numpy as np
import torch
import torchaudio
from huggingface_hub import hf_hub_download

-from .audio import
from .config import DiaConfig
-from .layers import DiaModel


def _sample_next_token(
    logits_BCxV: torch.Tensor,
    temperature: float,
    top_p: float,
-   use_cfg_filter: bool,
    cfg_filter_top_k: int | None = None,
) -> torch.Tensor:
    if temperature == 0.0:
        return torch.argmax(logits_BCxV, dim=-1)

    logits_BCxV = logits_BCxV / temperature
-   if
        _, top_k_indices_BCxV = torch.topk(logits_BCxV, k=cfg_filter_top_k, dim=-1)
        mask = torch.ones_like(logits_BCxV, dtype=torch.bool)
        mask.scatter_(dim=-1, index=top_k_indices_BCxV, value=False)
@@ -28,17 +48,21 @@ def _sample_next_token(

    if top_p < 1.0:
        probs_BCxV = torch.softmax(logits_BCxV, dim=-1)
-       sorted_probs_BCxV, sorted_indices_BCxV = torch.sort(
        cumulative_probs_BCxV = torch.cumsum(sorted_probs_BCxV, dim=-1)

-       # Calculate indices to remove based on top_p
        sorted_indices_to_remove_BCxV = cumulative_probs_BCxV > top_p

        indices_to_remove_BCxV = torch.zeros_like(sorted_indices_to_remove_BCxV)
-       indices_to_remove_BCxV.scatter_(
        logits_BCxV = logits_BCxV.masked_fill(indices_to_remove_BCxV, -torch.inf)

    final_probs_BCxV = torch.softmax(logits_BCxV, dim=-1)
@@ -48,31 +72,61 @@ def _sample_next_token(
    return sampled_indices_C


class Dia:
-   def __init__(
        """Initializes the Dia model.

        Args:
            config: The configuration object for the model.
-           device: The device to load the model onto.

        Raises:
            RuntimeError: If there is an error loading the DAC model.
        """
        super().__init__()
        self.config = config
-       self.device = device
        self.dac_model = None

    @classmethod
-   def from_local(
        """Loads the Dia model from local configuration and checkpoint files.

        Args:
            config_path: Path to the configuration JSON file.
            checkpoint_path: Path to the model checkpoint (.pth) file.
-           device: The device to load the model onto.

        Returns:
            An instance of the Dia model loaded with weights and set to eval mode.
@@ -85,23 +139,29 @@ class Dia:
        if config is None:
            raise FileNotFoundError(f"Config file not found at {config_path}")

-       dia = cls(config, device)

        try:
        except FileNotFoundError:
            raise FileNotFoundError(f"Checkpoint file not found at {checkpoint_path}")
        except Exception as e:
-           raise RuntimeError(

-       dia.model.to(device)
        dia.model.eval()
        dia._load_dac_model()
        return dia

    @classmethod
    def from_pretrained(
-       cls,
    ) -> "Dia":
        """Loads the Dia model from a Hugging Face Hub repository.
@@ -110,7 +170,7 @@ class Dia:

        Args:
            model_name: The Hugging Face Hub repository ID (e.g., "NariLabs/Dia-1.6B").
-           device: The device to load the model onto.

        Returns:
            An instance of the Dia model loaded with weights and set to eval mode.
@@ -121,7 +181,7 @@ class Dia:
        """
        config_path = hf_hub_download(repo_id=model_name, filename="config.json")
        checkpoint_path = hf_hub_download(repo_id=model_name, filename="dia-v0_1.pth")
-       return cls.from_local(config_path, checkpoint_path, device)

    def _load_dac_model(self):
        try:
@@ -131,44 +191,7 @@ class Dia:
            raise RuntimeError("Failed to load DAC model") from e
        self.dac_model = dac_model

-   def
-       self,
-       q_padding_mask_1d: torch.Tensor,
-       k_padding_mask_1d: torch.Tensor,
-       is_causal: bool = False,
-   ) -> torch.Tensor:
-       """
-       Creates the attention mask (self or cross) mimicking JAX segment ID logic.
-       """
-       B1, Tq = q_padding_mask_1d.shape
-       B2, Tk = k_padding_mask_1d.shape
-       assert B1 == B2, "Query and key batch dimensions must match"
-
-       p_mask_q = q_padding_mask_1d.unsqueeze(2)  # Shape [B, Tq, 1]
-       p_mask_k = k_padding_mask_1d.unsqueeze(1)  # Shape [B, 1, Tk]
-
-       # Condition A: Non-padding query attends to non-padding key
-       non_pad_attends_non_pad = p_mask_q & p_mask_k  # Shape [B, Tq, Tk]
-
-       # Condition B: Padding query attends to padding key
-       pad_attends_pad = (~p_mask_q) & (~p_mask_k)  # Shape [B, Tq, Tk]
-
-       # Combine: True if padding status is compatible (both non-pad OR both pad)
-       # This implementation follows Jax TPU splash attention kernel
-       mask = non_pad_attends_non_pad | pad_attends_pad  # Shape [B, Tq, Tk]
-
-       if is_causal:
-           # Ensure causality for self-attention (Tq == Tk)
-           assert Tq == Tk, "Causal mask requires query and key sequence lengths to be equal"
-           # Standard lower-triangular causal mask (True means allow)
-           causal_mask_2d = torch.tril(torch.ones((Tq, Tk), dtype=torch.bool, device=self.device))  # Shape [Tq, Tk]
-           causal_mask = mask & causal_mask_2d  # Shape [B, Tq, Tk]
-           return causal_mask.unsqueeze(1)  # Shape [B, 1, Tq, Tk] for broadcasting across heads
-       else:
-           # For cross-attention or non-causal self-attention
-           return mask.unsqueeze(1)  # Shape [B, 1, Tq, Tk] for broadcasting across heads
-
-   def _prepare_text_input(self, text: str) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Encodes text prompt, pads, and creates attention mask and positions."""
        text_pad_value = self.config.data.text_pad_value
        max_len = self.config.data.text_length
@@ -190,14 +213,168 @@ class Dia:
            constant_values=text_pad_value,
        ).astype(np.uint8)

-       src_tokens =

    @torch.inference_mode()
    def generate(
@@ -207,225 +384,105 @@ class Dia:
        cfg_scale: float = 3.0,
        temperature: float = 1.3,
        top_p: float = 0.95,
        audio_prompt_path: str | None = None,
    ) -> np.ndarray:
-       """
-       Generates audio from a text prompt (and optional audio prompt) using the Nari model.
-
-       Returns:
-           A tensor of generated audio codes (shape: [max_tokens, num_channels]).
-       """
-       num_channels = self.config.data.channels
-       audio_bos_value = self.config.data.audio_bos_value
        audio_eos_value = self.config.data.audio_eos_value
        audio_pad_value = self.config.data.audio_pad_value
        delay_pattern = self.config.data.delay_pattern
        max_tokens = self.config.data.audio_length if max_tokens is None else max_tokens
-       delay_tensor = torch.tensor(delay_pattern, dtype=torch.long, device=self.device)
        max_delay_pattern = max(delay_pattern)
        self.model.eval()

-       ) = self._prepare_text_input(text)
-
-       unc_src_BxS = torch.zeros_like(cond_src_BxS)
-       src_BxS = torch.cat([unc_src_BxS, cond_src_BxS], dim=0)
-       src_positions_BxS = cond_src_positions_BxS.expand(2, -1)
-       src_padding_mask_BxS = cond_src_padding_mask_BxS.expand(2, -1)
-       enc_self_attn_mask_Bx1xSxS = cond_enc_self_attn_mask_Bx1xSxS.expand(2, -1, -1, -1)
-
-       # 2. Encoder Pass
-       # with torch.autocast(device_type="cuda", dtype=forward_dtype):
-       encoder_out = self.model.encoder(
-           x_ids=src_BxS,
-           src_positions=src_positions_BxS,
-           deterministic=True,
-           attn_mask=enc_self_attn_mask_Bx1xSxS,
-       )  # Shape: (B, S, E)
-
-       # 3. Prepare Decoder Inputs
-       # 3-1. Allocate KV Cache (Static)
-       decoder_cross_attention_cache: list[KVCache] = self.model.decoder.precompute_cross_attention_kv(
-           max_tokens, encoder_out, src_positions_BxS
-       )
-
-       decoder_self_attention_cache: list[KVCache] = []
-       for _ in range(self.model.decoder.num_layers):
-           decoder_self_attention_cache.append(
-               KVCache(
-                   self.config.model.decoder.gqa_query_heads,
-                   max_tokens,
-                   self.config.model.decoder.gqa_head_dim,
-                   self.device,
-               )
-           )
-
-       # 3-2. Initialize Decoder Inputs
-       generated_BxTxC = torch.full(
-           (2, 1, num_channels),
-           fill_value=audio_bos_value,
-           dtype=torch.long,
-           device=self.device,
-       )
-
-       current_step = 0
-       prompt_len_inc_bos = 1  # Start with BOS length
-
-       # 3-3. Load Audio Prompt (if provided)
-       if audio_prompt_path is not None:
-           audio_prompt, sr = torchaudio.load(audio_prompt_path, channels_first=True)  # C, T
-           if sr != 44100:  # Resample to 44.1kHz
-               audio_prompt = torchaudio.functional.resample(audio_prompt, sr, 44100)
-           audio_prompt = audio_prompt.to(self.device).unsqueeze(0)  # 1, C, T
-           audio_prompt = audio_to_codebook(self.dac_model, audio_prompt, data_config=self.config.data)
-           generated_BxTxC = torch.cat([generated_BxTxC, audio_prompt.expand(2, -1, -1)], dim=1)
-
-           prefill_len = generated_BxTxC.shape[1]
-           prompt_len_inc_bos = prefill_len
-           prefill_tgt_pos = torch.arange(prefill_len, device=self.device).unsqueeze(0).expand(2, -1)
-           prefill_tgt_padding_mask = (generated_BxTxC != audio_pad_value).any(dim=2)
-
-           prefill_self_attn_mask = self._create_attn_mask(
-               prefill_tgt_padding_mask,
-               prefill_tgt_padding_mask,
-               is_causal=True,
-           )
-           prefill_cross_attn_mask = self._create_attn_mask(
-               prefill_tgt_padding_mask,
-               src_padding_mask_BxS,
-               is_causal=False,
-           )

-           encoder_out=encoder_out,
-           tgt_positions=prefill_tgt_pos,
-           src_positions=src_positions_BxS,
-           deterministic=True,
-           self_attn_mask=prefill_self_attn_mask,
-           cross_attn_mask=prefill_cross_attn_mask,
-           self_attention_cache=decoder_self_attention_cache,
-           cross_attention_cache=decoder_cross_attention_cache,
-       )

        eos_countdown = -1
-       extra_steps_after_eos = 30
-       # Make generated_BxTxC a fixed size tensor
-       # Length is either 1 + max tokens or 1 + prompt len + max tokens
-       generated_BxTxC = torch.cat(
-           [
-               generated_BxTxC,
-               torch.full(
-                   (2, max_tokens, num_channels),
-                   fill_value=-1,
-                   dtype=torch.long,
-                   device=self.device,
-               ),
-           ],
-           dim=1,
-       )

-       decode_step = self.model.decoder.decode_step
        if use_torch_compile:
-       )

-       (
-           is_causal=False,
-       )  # [B, 1, 1, S]
-
-       for step in range(current_step, current_step + max_tokens):
-           tgt_ids_Bx1xC = generated_BxTxC[:, step, :].unsqueeze(1)
-           tgt_pos_Bx1 = torch.full(
-               (2, 1),
-               fill_value=step,
-               dtype=torch.long,
-               device=self.device,
-           )

-           self_attn_mask=None,
-           cross_attn_mask=decoder_cross_attn_mask,
-           self_attention_cache=decoder_self_attention_cache,
-           cross_attention_cache=decoder_cross_attention_cache,
        )

-           cond_logits_CxV = logits_last_BxCxV[1, :, :]
-
-           cfg_logits_CxV = cond_logits_CxV + cfg_scale * (cond_logits_CxV - uncond_logits_CxV)
-
-           logits_CxV = cfg_logits_CxV.reshape((-1, V))  # C, V
-           logits_CxV[:, 1025:] = -torch.inf
-
-           # Sample next token
-           pred_C = _sample_next_token(
-               logits_CxV.float(),
-               temperature=temperature,
-               top_p=top_p,
-               use_cfg_filter=use_cfg_filter,
-               cfg_filter_top_k=cfg_filter_top_k,
-           )

-               audio_bos_value,
-           )
-
-           generated_BxTxC[:, step + 1, :] = pred_C.unsqueeze(0).expand(2, -1)
-
-           if not eos_detected_channel_0 and pred_C[0] == audio_eos_value:
-               eos_detected_channel_0 = True
-               eos_countdown = extra_steps_after_eos

            if eos_countdown > 0:
                step_after_eos = max_delay_pattern - eos_countdown
                for i, d in enumerate(delay_pattern):
                    if step_after_eos == d:
                    elif step_after_eos > d:
                eos_countdown -= 1
-               if eos_countdown == 0:
-                   break
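The rewritten file below keeps the same sampling idea as the removed code: combine conditional and unconditional logits with the CFG scale, restrict to the top-k candidates, then apply nucleus (top-p) filtering before drawing a token per channel. A self-contained sketch of that pipeline, with an illustrative helper name and tensor shapes that are not part of the commit:

import torch

def cfg_top_p_sample(cond_logits, uncond_logits, cfg_scale=3.0, temperature=1.3, top_p=0.95, top_k=35):
    # Classifier-free guidance: push conditional logits away from unconditional ones.
    logits = cond_logits + cfg_scale * (cond_logits - uncond_logits)
    logits = logits / temperature

    # Keep only the top-k candidates per row.
    _, top_k_idx = torch.topk(logits, k=top_k, dim=-1)
    keep = torch.zeros_like(logits, dtype=torch.bool).scatter_(-1, top_k_idx, True)
    logits = logits.masked_fill(~keep, -torch.inf)

    # Nucleus filtering: drop tokens outside the smallest set whose cumulative prob exceeds top_p.
    probs = torch.softmax(logits, dim=-1)
    sorted_probs, sorted_idx = torch.sort(probs, dim=-1, descending=True)
    cum = torch.cumsum(sorted_probs, dim=-1)
    remove_sorted = cum > top_p
    remove_sorted[..., 1:] = remove_sorted[..., :-1].clone()
    remove_sorted[..., 0] = False
    remove = torch.zeros_like(remove_sorted).scatter_(-1, sorted_idx, remove_sorted)
    logits = logits.masked_fill(remove, -torch.inf)

    probs = torch.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1).squeeze(-1)

pred = cfg_top_p_sample(torch.randn(9, 1028), torch.randn(9, 1028))  # one token per channel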
+import time
+from enum import Enum
+
import dac
import numpy as np
import torch
import torchaudio
from huggingface_hub import hf_hub_download

+from .audio import (
+    apply_audio_delay,
+    build_delay_indices,
+    build_revert_indices,
+    decode,
+    revert_audio_delay,
+)
from .config import DiaConfig
+from .layers import DiaModel
+from .state import DecoderInferenceState, DecoderOutput, EncoderInferenceState
+
+
+DEFAULT_SAMPLE_RATE = 44100
+
+
+def _get_default_device():
+    if torch.cuda.is_available():
+        return torch.device("cuda")
+    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+        return torch.device("mps")
+    return torch.device("cpu")


def _sample_next_token(
    logits_BCxV: torch.Tensor,
    temperature: float,
    top_p: float,
    cfg_filter_top_k: int | None = None,
) -> torch.Tensor:
    if temperature == 0.0:
        return torch.argmax(logits_BCxV, dim=-1)

    logits_BCxV = logits_BCxV / temperature
+   if cfg_filter_top_k is not None:
        _, top_k_indices_BCxV = torch.topk(logits_BCxV, k=cfg_filter_top_k, dim=-1)
        mask = torch.ones_like(logits_BCxV, dtype=torch.bool)
        mask.scatter_(dim=-1, index=top_k_indices_BCxV, value=False)

    if top_p < 1.0:
        probs_BCxV = torch.softmax(logits_BCxV, dim=-1)
+       sorted_probs_BCxV, sorted_indices_BCxV = torch.sort(
+           probs_BCxV, dim=-1, descending=True
+       )
        cumulative_probs_BCxV = torch.cumsum(sorted_probs_BCxV, dim=-1)

        sorted_indices_to_remove_BCxV = cumulative_probs_BCxV > top_p
+       sorted_indices_to_remove_BCxV[..., 1:] = sorted_indices_to_remove_BCxV[
+           ..., :-1
+       ].clone()
+       sorted_indices_to_remove_BCxV[..., 0] = 0

        indices_to_remove_BCxV = torch.zeros_like(sorted_indices_to_remove_BCxV)
+       indices_to_remove_BCxV.scatter_(
+           dim=-1, index=sorted_indices_BCxV, src=sorted_indices_to_remove_BCxV
+       )
        logits_BCxV = logits_BCxV.masked_fill(indices_to_remove_BCxV, -torch.inf)

    final_probs_BCxV = torch.softmax(logits_BCxV, dim=-1)
...
    return sampled_indices_C


+class ComputeDtype(str, Enum):
+    FLOAT32 = "float32"
+    FLOAT16 = "float16"
+    BFLOAT16 = "bfloat16"
+
+    def to_dtype(self) -> torch.dtype:
+        if self == ComputeDtype.FLOAT32:
+            return torch.float32
+        elif self == ComputeDtype.FLOAT16:
+            return torch.float16
+        elif self == ComputeDtype.BFLOAT16:
+            return torch.bfloat16
+        else:
+            raise ValueError(f"Unsupported compute dtype: {self}")


class Dia:
+   def __init__(
+       self,
+       config: DiaConfig,
+       compute_dtype: str | ComputeDtype = ComputeDtype.FLOAT32,
+       device: torch.device | None = None,
+   ):
        """Initializes the Dia model.

        Args:
            config: The configuration object for the model.
+           device: The device to load the model onto. If None, will automatically select the best available device.

        Raises:
            RuntimeError: If there is an error loading the DAC model.
        """
        super().__init__()
        self.config = config
+       self.device = device if device is not None else _get_default_device()
+       if isinstance(compute_dtype, str):
+           compute_dtype = ComputeDtype(compute_dtype)
+       self.compute_dtype = compute_dtype.to_dtype()
+       self.model = DiaModel(config, self.compute_dtype)
        self.dac_model = None

    @classmethod
+   def from_local(
+       cls,
+       config_path: str,
+       checkpoint_path: str,
+       compute_dtype: str | ComputeDtype = ComputeDtype.FLOAT32,
+       device: torch.device | None = None,
+   ) -> "Dia":
        """Loads the Dia model from local configuration and checkpoint files.

        Args:
            config_path: Path to the configuration JSON file.
            checkpoint_path: Path to the model checkpoint (.pth) file.
+           device: The device to load the model onto. If None, will automatically select the best available device.

        Returns:
            An instance of the Dia model loaded with weights and set to eval mode.
...
        if config is None:
            raise FileNotFoundError(f"Config file not found at {config_path}")

+       dia = cls(config, compute_dtype, device)

        try:
+           state_dict = torch.load(checkpoint_path, map_location=dia.device)
+           dia.model.load_state_dict(state_dict)
        except FileNotFoundError:
            raise FileNotFoundError(f"Checkpoint file not found at {checkpoint_path}")
        except Exception as e:
+           raise RuntimeError(
+               f"Error loading checkpoint from {checkpoint_path}"
+           ) from e

+       dia.model.to(dia.device)
        dia.model.eval()
        dia._load_dac_model()
        return dia

    @classmethod
    def from_pretrained(
+       cls,
+       model_name: str = "nari-labs/Dia-1.6B",
+       compute_dtype: str | ComputeDtype = ComputeDtype.FLOAT32,
+       device: torch.device | None = None,
    ) -> "Dia":
        """Loads the Dia model from a Hugging Face Hub repository.
...
        Args:
            model_name: The Hugging Face Hub repository ID (e.g., "NariLabs/Dia-1.6B").
+           device: The device to load the model onto. If None, will automatically select the best available device.

        Returns:
            An instance of the Dia model loaded with weights and set to eval mode.
...
        """
        config_path = hf_hub_download(repo_id=model_name, filename="config.json")
        checkpoint_path = hf_hub_download(repo_id=model_name, filename="dia-v0_1.pth")
+       return cls.from_local(config_path, checkpoint_path, compute_dtype, device)

    def _load_dac_model(self):
        try:
...
            raise RuntimeError("Failed to load DAC model") from e
        self.dac_model = dac_model

+   def _prepare_text_input(self, text: str) -> torch.Tensor:
        """Encodes text prompt, pads, and creates attention mask and positions."""
        text_pad_value = self.config.data.text_pad_value
        max_len = self.config.data.text_length
...
            constant_values=text_pad_value,
        ).astype(np.uint8)

+       src_tokens = (
+           torch.from_numpy(padded_text_np).to(torch.long).to(self.device).unsqueeze(0)
+       )  # [1, S]
+       return src_tokens

+   def _prepare_audio_prompt(
+       self, audio_prompt: torch.Tensor | None
+   ) -> tuple[torch.Tensor, int]:
+       num_channels = self.config.data.channels
+       audio_bos_value = self.config.data.audio_bos_value
+       audio_pad_value = self.config.data.audio_pad_value
+       delay_pattern = self.config.data.delay_pattern
+       max_delay_pattern = max(delay_pattern)

+       prefill = torch.full(
+           (1, num_channels),
+           fill_value=audio_bos_value,
+           dtype=torch.int,
+           device=self.device,
+       )

+       prefill_step = 1
+
+       if audio_prompt is not None:
+           prefill_step += audio_prompt.shape[0]
+           prefill = torch.cat([prefill, audio_prompt], dim=0)
+
+       delay_pad_tensor = torch.full(
+           (max_delay_pattern, num_channels),
+           fill_value=-1,
+           dtype=torch.int,
+           device=self.device,
+       )
+       prefill = torch.cat([prefill, delay_pad_tensor], dim=0)
+
+       delay_precomp = build_delay_indices(
+           B=1,
+           T=prefill.shape[0],
+           C=num_channels,
+           delay_pattern=delay_pattern,
+       )
+
+       prefill = apply_audio_delay(
+           audio_BxTxC=prefill.unsqueeze(0),
+           pad_value=audio_pad_value,
+           bos_value=audio_bos_value,
+           precomp=delay_precomp,
+       ).squeeze(0)
+
+       return prefill, prefill_step
+
+   def _prepare_generation(
+       self, text: str, audio_prompt: str | torch.Tensor | None, verbose: bool
+   ):
+       enc_input_cond = self._prepare_text_input(text)
+       enc_input_uncond = torch.zeros_like(enc_input_cond)
+       enc_input = torch.cat([enc_input_uncond, enc_input_cond], dim=0)
+
+       if isinstance(audio_prompt, str):
+           audio_prompt = self.load_audio(audio_prompt)
+       prefill, prefill_step = self._prepare_audio_prompt(audio_prompt)
+
+       if verbose:
+           print("generate: data loaded")
+
+       enc_state = EncoderInferenceState.new(self.config, enc_input_cond)
+       encoder_out = self.model.encoder(enc_input, enc_state)
+
+       dec_cross_attn_cache = self.model.decoder.precompute_cross_attn_cache(
+           encoder_out, enc_state.positions
+       )
+       dec_state = DecoderInferenceState.new(
+           self.config,
+           enc_state,
+           encoder_out,
+           dec_cross_attn_cache,
+           self.compute_dtype,
+       )
+       dec_output = DecoderOutput.new(self.config, self.device)
+       dec_output.prefill(prefill, prefill_step)
+
+       dec_step = prefill_step - 1
+       if dec_step > 0:
+           dec_state.prepare_step(0, dec_step)
+           tokens_BxTxC = (
+               dec_output.get_tokens_at(0, dec_step).unsqueeze(0).expand(2, -1, -1)
+           )
+           self.model.decoder.forward(tokens_BxTxC, dec_state)
+
+       return dec_state, dec_output
+
+   def _decoder_step(
+       self,
+       tokens_Bx1xC: torch.Tensor,
+       dec_state: DecoderInferenceState,
+       cfg_scale: float,
+       temperature: float,
+       top_p: float,
+       cfg_filter_top_k: int,
+   ) -> torch.Tensor:
+       audio_eos_value = self.config.data.audio_eos_value
+       logits_Bx1xCxV = self.model.decoder.decode_step(tokens_Bx1xC, dec_state)
+
+       logits_last_BxCxV = logits_Bx1xCxV[:, -1, :, :]
+       uncond_logits_CxV = logits_last_BxCxV[0, :, :]
+       cond_logits_CxV = logits_last_BxCxV[1, :, :]
+
+       logits_CxV = cond_logits_CxV + cfg_scale * (cond_logits_CxV - uncond_logits_CxV)
+       logits_CxV[:, audio_eos_value + 1 :] = -torch.inf
+       logits_CxV[1:, audio_eos_value:] = -torch.inf
+
+       pred_C = _sample_next_token(
+           logits_CxV.float(),
+           temperature=temperature,
+           top_p=top_p,
+           cfg_filter_top_k=cfg_filter_top_k,
+       )
+       return pred_C
+
+   def _generate_output(self, generated_codes: torch.Tensor) -> np.ndarray:
+       num_channels = self.config.data.channels
+       seq_length = generated_codes.shape[0]
+       delay_pattern = self.config.data.delay_pattern
+       audio_pad_value = self.config.data.audio_pad_value
+       max_delay_pattern = max(delay_pattern)
+
+       revert_precomp = build_revert_indices(
+           B=1,
+           T=seq_length,
+           C=num_channels,
+           delay_pattern=delay_pattern,
+       )
+
+       codebook = revert_audio_delay(
+           audio_BxTxC=generated_codes.unsqueeze(0),
+           pad_value=audio_pad_value,
+           precomp=revert_precomp,
+           T=seq_length,
+       )[:, :-max_delay_pattern, :]
+
+       min_valid_index = 0
+       max_valid_index = 1023
+       invalid_mask = (codebook < min_valid_index) | (codebook > max_valid_index)
+       codebook[invalid_mask] = 0
+
+       audio = decode(self.dac_model, codebook.transpose(1, 2))
+
+       return audio.squeeze().cpu().numpy()
+
+   def load_audio(self, audio_path: str) -> torch.Tensor:
+       audio, sr = torchaudio.load(audio_path, channels_first=True)  # C, T
+       if sr != DEFAULT_SAMPLE_RATE:
+           audio = torchaudio.functional.resample(audio, sr, DEFAULT_SAMPLE_RATE)
+       audio = audio.to(self.device).unsqueeze(0)  # 1, C, T
+       audio_data = self.dac_model.preprocess(audio, DEFAULT_SAMPLE_RATE)
+       _, encoded_frame, _, _, _ = self.dac_model.encode(audio_data)  # 1, C, T
+       return encoded_frame.squeeze(0).transpose(0, 1)
+
+   def save_audio(self, path: str, audio: np.ndarray):
+       import soundfile as sf
+
+       sf.write(path, audio, DEFAULT_SAMPLE_RATE)

    @torch.inference_mode()
    def generate(
...
        cfg_scale: float = 3.0,
        temperature: float = 1.3,
        top_p: float = 0.95,
+       use_torch_compile: bool = False,
+       cfg_filter_top_k: int = 35,
+       audio_prompt: str | torch.Tensor | None = None,
        audio_prompt_path: str | None = None,
+       use_cfg_filter: bool | None = None,
+       verbose: bool = False,
    ) -> np.ndarray:
        audio_eos_value = self.config.data.audio_eos_value
        audio_pad_value = self.config.data.audio_pad_value
        delay_pattern = self.config.data.delay_pattern
        max_tokens = self.config.data.audio_length if max_tokens is None else max_tokens
        max_delay_pattern = max(delay_pattern)
        self.model.eval()

+       if audio_prompt_path:
+           print("Warning: audio_prompt_path is deprecated. Use audio_prompt instead.")
+           audio_prompt = audio_prompt_path
+       if use_cfg_filter is not None:
+           print("Warning: use_cfg_filter is deprecated.")

+       if verbose:
+           total_start_time = time.time()

+       dec_state, dec_output = self._prepare_generation(text, audio_prompt, verbose)
+       dec_step = dec_output.prefill_step - 1

+       bos_countdown = max_delay_pattern
+       eos_detected = False
        eos_countdown = -1

        if use_torch_compile:
+           step_fn = torch.compile(self._decoder_step, mode="default")
+       else:
+           step_fn = self._decoder_step

+       if verbose:
+           print("generate: starting generation loop")
+           if use_torch_compile:
+               print(
+                   "generate: by using use_torch_compile=True, the first step would take long"
+               )
+           start_time = time.time()

+       while dec_step < max_tokens:
+           dec_state.prepare_step(dec_step)
+           tokens_Bx1xC = (
+               dec_output.get_tokens_at(dec_step).unsqueeze(0).expand(2, -1, -1)
            )
+           pred_C = step_fn(
+               tokens_Bx1xC,
+               dec_state,
+               cfg_scale,
+               temperature,
+               top_p,
+               cfg_filter_top_k,
            )

+           if (
+               not eos_detected and pred_C[0] == audio_eos_value
+           ) or dec_step == max_tokens - max_delay_pattern - 1:
+               eos_detected = True
+               eos_countdown = max_delay_pattern

            if eos_countdown > 0:
                step_after_eos = max_delay_pattern - eos_countdown
                for i, d in enumerate(delay_pattern):
                    if step_after_eos == d:
+                       pred_C[i] = audio_eos_value
                    elif step_after_eos > d:
+                       pred_C[i] = audio_pad_value
                eos_countdown -= 1

+           bos_countdown = max(0, bos_countdown - 1)
+           dec_output.update_one(pred_C, dec_step + 1, bos_countdown > 0)

+           if eos_countdown == 0:
+               break

+           dec_step += 1
+           if verbose and dec_step % 86 == 0:
+               duration = time.time() - start_time
+               print(
+                   f"generate step {dec_step}: speed={86 / duration:.3f} tokens/s, realtime factor={1 / duration:.3f}x"
+               )
+               start_time = time.time()

+       if dec_output.prefill_step >= dec_step + 1:
+           print("Warning: Nothing generated")
+           return None

+       generated_codes = dec_output.generated_tokens[
+           dec_output.prefill_step : dec_step + 1, :
+       ]

+       if verbose:
+           total_step = dec_step + 1 - dec_output.prefill_step
+           total_duration = time.time() - total_start_time
+           print(
+               f"generate: total step={total_step}, total duration={total_duration:.3f}s"
+           )

+       return self._generate_output(generated_codes)
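Taken together, the rewritten dia/model.py exposes a much smaller public surface: load, generate, save. A typical call might look like the sketch below; the script itself is not part of this commit, and the prompt text and dtype choice are only examples.

import torch

from dia.model import Dia

model = Dia.from_pretrained(
    "nari-labs/Dia-1.6B",
    compute_dtype="float16",        # "float32", "float16", or "bfloat16"
    device=torch.device("cuda"),    # omit to auto-select cuda / mps / cpu
)

text = "[S1] Dia is an open weights text to dialogue model. [S2] Try it out!"
audio = model.generate(
    text,
    cfg_scale=3.0,
    temperature=1.3,
    top_p=0.95,
    cfg_filter_top_k=35,
    use_torch_compile=False,
    verbose=True,
)
model.save_audio("output.wav", audio)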
dia/state.py
ADDED
@@ -0,0 +1,234 @@
+from dataclasses import dataclass
+
+import torch
+
+from .config import DiaConfig
+
+
+def create_attn_mask(
+    q_padding_mask_1d: torch.Tensor,
+    k_padding_mask_1d: torch.Tensor,
+    device: torch.device,
+    is_causal: bool = False,
+) -> torch.Tensor:
+    """
+    Creates the attention mask (self or cross) mimicking JAX segment ID logic.
+    """
+    B1, Tq = q_padding_mask_1d.shape
+    B2, Tk = k_padding_mask_1d.shape
+    assert B1 == B2, "Query and key batch dimensions must match"
+
+    p_mask_q = q_padding_mask_1d.unsqueeze(2)  # Shape [B, Tq, 1]
+    p_mask_k = k_padding_mask_1d.unsqueeze(1)  # Shape [B, 1, Tk]
+
+    # Condition A: Non-padding query attends to non-padding key
+    non_pad_attends_non_pad = p_mask_q & p_mask_k  # Shape [B, Tq, Tk]
+
+    # Condition B: Padding query attends to padding key
+    pad_attends_pad = (~p_mask_q) & (~p_mask_k)  # Shape [B, Tq, Tk]
+
+    # Combine: True if padding status is compatible (both non-pad OR both pad)
+    mask = non_pad_attends_non_pad | pad_attends_pad  # Shape [B, Tq, Tk]
+
+    if is_causal:
+        assert Tq == Tk, (
+            "Causal mask requires query and key sequence lengths to be equal"
+        )
+        causal_mask_2d = torch.tril(
+            torch.ones((Tq, Tk), dtype=torch.bool, device=device)
+        )  # Shape [Tq, Tk]
+        causal_mask = mask & causal_mask_2d  # Shape [B, Tq, Tk]
+        return causal_mask.unsqueeze(1)  # Shape [B, 1, Tq, Tk]
+    else:
+        return mask.unsqueeze(1)  # Shape [B, 1, Tq, Tk]
+
+
+@dataclass
+class EncoderInferenceState:
+    """Parameters specifically for encoder inference."""
+
+    max_seq_len: int
+    device: torch.device
+    positions: torch.Tensor
+    padding_mask: torch.Tensor
+    attn_mask: torch.Tensor
+
+    @classmethod
+    def new(cls, config: DiaConfig, cond_src: torch.Tensor) -> "EncoderInferenceState":
+        """Creates EncoderInferenceState from DiaConfig and a device."""
+        device = cond_src.device
+
+        positions = (
+            torch.arange(config.data.text_length, device=device)
+            .to(torch.long)
+            .unsqueeze(0)
+            .expand(2, -1)
+        )
+        padding_mask = (cond_src != config.data.text_pad_value).to(device).expand(2, -1)
+        attn_mask = create_attn_mask(
+            padding_mask, padding_mask, device, is_causal=False
+        )
+
+        return cls(
+            max_seq_len=config.data.text_length,
+            device=device,
+            positions=positions,
+            padding_mask=padding_mask,
+            attn_mask=attn_mask,
+        )
+
+
+class KVCache:
+    def __init__(
+        self,
+        num_heads: int,
+        max_len: int,
+        head_dim: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        k: torch.Tensor | None = None,
+        v: torch.Tensor | None = None,
+    ):
+        self.k = (
+            torch.zeros((2, num_heads, max_len, head_dim), dtype=dtype, device=device)
+            if k is None
+            else k
+        )
+        self.v = (
+            torch.zeros((2, num_heads, max_len, head_dim), dtype=dtype, device=device)
+            if v is None
+            else v
+        )
+        self.current_idx = torch.tensor(0)
+
+    @classmethod
+    def from_kv(cls, k: torch.Tensor, v: torch.Tensor) -> "KVCache":
+        return cls(
+            num_heads=k.shape[1],
+            max_len=k.shape[2],
+            head_dim=k.shape[3],
+            dtype=k.dtype,
+            device=k.device,
+            k=k,
+            v=v,
+        )
+
+    def update(
+        self, k: torch.Tensor, v: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        self.k[:, :, self.current_idx : self.current_idx + 1, :] = k
+        self.v[:, :, self.current_idx : self.current_idx + 1, :] = v
+        self.current_idx += 1
+        return self.k[:, :, : self.current_idx, :], self.v[:, :, : self.current_idx, :]
+
+    def prefill(
+        self, k: torch.Tensor, v: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        prefill_len = k.shape[2]
+        self.k[:, :, :prefill_len, :] = k
+        self.v[:, :, :prefill_len, :] = v
+        self.current_idx = prefill_len - 1
+
+
+@dataclass
+class DecoderInferenceState:
+    """Parameters specifically for decoder inference."""
+
+    device: torch.device
+    dtype: torch.dtype
+    enc_out: torch.Tensor
+    enc_positions: torch.Tensor
+    dec_positions: torch.Tensor
+    dec_cross_attn_mask: torch.Tensor
+    self_attn_cache: list[KVCache]
+    cross_attn_cache: list[KVCache]
+
+    @classmethod
+    def new(
+        cls,
+        config: DiaConfig,
+        enc_state: EncoderInferenceState,
+        enc_out: torch.Tensor,
+        dec_cross_attn_cache: list[KVCache],
+        compute_dtype: torch.dtype,
+    ) -> "DecoderInferenceState":
+        """Creates DecoderInferenceState from DiaConfig and a device."""
+        device = enc_out.device
+        max_audio_len = config.data.audio_length
+
+        dec_positions = torch.full(
+            (2, 1), fill_value=0, dtype=torch.long, device=device
+        )
+        tgt_padding_mask = torch.ones((2, 1), dtype=torch.bool, device=device)
+        dec_cross_attn_mask = create_attn_mask(
+            tgt_padding_mask, enc_state.padding_mask, device, is_causal=False
+        )
+
+        self_attn_cache = [
+            KVCache(
+                config.model.decoder.kv_heads,
+                max_audio_len,
+                config.model.decoder.gqa_head_dim,
+                compute_dtype,
+                device,
+            )
+            for _ in range(config.model.decoder.n_layer)
+        ]
+
+        return cls(
+            device=device,
+            dtype=compute_dtype,
+            enc_out=enc_out,
+            enc_positions=enc_state.positions,
+            dec_positions=dec_positions,
+            dec_cross_attn_mask=dec_cross_attn_mask,
+            self_attn_cache=self_attn_cache,
+            cross_attn_cache=dec_cross_attn_cache,
+        )
+
+    def prepare_step(self, step_from: int, step_to: int | None = None) -> None:
+        if step_to is None:
+            step_to = step_from + 1
+        self.dec_positions = (
+            torch.arange(step_from, step_to, device=self.device)
+            .unsqueeze(0)
+            .expand(2, -1)
+        )
+
+
+@dataclass
+class DecoderOutput:
+    generated_tokens: torch.Tensor
+    prefill_step: int
+
+    @classmethod
+    def new(cls, config: DiaConfig, device: torch.device) -> "DecoderOutput":
+        max_audio_len = config.data.audio_length
+        return cls(
+            generated_tokens=torch.full(
+                (max_audio_len, config.data.channels),
+                fill_value=-1,
+                dtype=torch.int,
+                device=device,
+            ),
+            prefill_step=0,
+        )
+
+    def get_tokens_at(self, step_from: int, step_to: int | None = None) -> torch.Tensor:
+        if step_to is None:
+            step_to = step_from + 1
+        return self.generated_tokens[step_from:step_to, :]
+
+    def update_one(self, dec_out: torch.Tensor, step: int, apply_mask: bool = False):
+        if apply_mask:
+            mask = self.generated_tokens[step : step + 1, :] == -1
+            self.generated_tokens[step : step + 1, :] = torch.where(
+                mask, dec_out, self.generated_tokens[step : step + 1, :]
+            )
+        else:
+            self.generated_tokens[step : step + 1, :] = dec_out
+
+    def prefill(self, dec_out: torch.Tensor, prefill_step: int):
+        length = dec_out.shape[0]
+        self.generated_tokens[0:length, :] = dec_out
+        self.prefill_step = prefill_step
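As a quick illustration of how the cache added in dia/state.py behaves, prefill writes a whole prompt's keys and values at once, while update writes one step at current_idx and returns the valid prefix. The sketch below uses toy shapes outside the real decoder; note that after prefill, current_idx points at the last prompt position, so the first update recomputes that position's K/V.

import torch

from dia.state import KVCache

num_heads, max_len, head_dim = 4, 16, 8  # toy sizes
cache = KVCache(num_heads, max_len, head_dim, torch.float32, torch.device("cpu"))

# Prefill with a 5-step prompt: shape (2, H, T, Dh) for the CFG batch of 2.
k0 = torch.randn(2, num_heads, 5, head_dim)
v0 = torch.randn(2, num_heads, 5, head_dim)
cache.prefill(k0, v0)  # current_idx now points at the last prompt step (index 4)

# One decode step writes K/V at current_idx and returns the prefix seen so far.
k1 = torch.randn(2, num_heads, 1, head_dim)
v1 = torch.randn(2, num_heads, 1, head_dim)
k_all, v_all = cache.update(k1, v1)
print(k_all.shape)  # torch.Size([2, 4, 5, 8]) since current_idx advanced from 4 to 5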