{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Austi\\anaconda3\\envs\\janus_env\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Python version is above 3.10, patching the collections module.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Austi\\anaconda3\\envs\\janus_env\\lib\\site-packages\\transformers\\models\\auto\\image_processing_auto.py:590: FutureWarning: The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead\n", " warnings.warn(\n" ] } ], "source": [ "import gradio as gr\n", "import torch\n", "from transformers import AutoConfig, AutoModelForCausalLM\n", "from janus.models import MultiModalityCausalLM, VLChatProcessor\n", "from janus.utils.io import load_pil_images\n", "from demo.cam import generate_gradcam, AttentionGuidedCAM\n", "from captum.attr import LayerGradCam\n", "from PIL import Image\n", "from einops import rearrange\n", "\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import os\n", "import time\n", "\n", "import torch.nn.functional as F\n", "from scipy.ndimage import filters\n", "from torch import nn\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Usage Class Token: True\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some weights of MultiModalityCausalLM were not initialized from the model checkpoint at deepseek-ai/Janus-Pro-1B and are newly initialized: ['vision_model.vision_tower.cls_token']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "Some weights of MultiModalityCausalLM were not initialized from the model checkpoint at deepseek-ai/Janus-Pro-1B and are newly initialized because the shapes did not match:\n", "- vision_model.vision_tower.pos_embed: found shape torch.Size([1, 576, 1024]) in the checkpoint and torch.Size([1, 577, 1024]) in the model instantiated\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n", "You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.\n", "Some kwargs in processor config are unused and will not have any effect: num_image_tokens, sft_format, image_tag, ignore_id, add_special_token, mask_prompt. 
\n" ] } ], "source": [ "\n", "model_path = \"deepseek-ai/Janus-Pro-1B\"\n", "config = AutoConfig.from_pretrained(model_path)\n", "language_config = config.language_config\n", "language_config._attn_implementation = 'eager'  # eager attention so attention weights can be extracted for CAM\n", "vl_gpt = AutoModelForCausalLM.from_pretrained(model_path,\n", " language_config=language_config,\n", " trust_remote_code=True,\n", " ignore_mismatched_sizes=True # Adding CLS token, will be handled manually\n", " )\n", "\n", "dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float16\n", "# dtype = torch.float32  # full-precision alternative\n", "\n", "if torch.cuda.is_available():\n", " vl_gpt = vl_gpt.to(dtype).cuda()\n", "else:\n", " # vl_gpt = vl_gpt.to(torch.float16)\n", " torch.set_default_device(\"mps\")  # assumes Apple Silicon (MPS) when CUDA is unavailable\n", " vl_gpt = vl_gpt.to(dtype)\n", "\n", "vl_chat_processor = VLChatProcessor.from_pretrained(model_path)\n", "tokenizer = vl_chat_processor.tokenizer\n", "cuda_device = 'cuda' if torch.cuda.is_available() else 'mps'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CLIPVisionTower(\n", " (vision_tower): VisionTransformer(\n", " (patch_embed): PatchEmbed(\n", " (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))\n", " (norm): Identity()\n", " )\n", " (pos_drop): Dropout(p=0.0, inplace=False)\n", " (patch_drop): Identity()\n", " (norm_pre): Identity()\n", " (blocks): Sequential(\n", " (0): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (1): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (2): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): 
Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (3): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (4): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (5): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (6): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, 
inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (7): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (8): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (9): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (10): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " 
(attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (11): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (12): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (13): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (14): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " 
(k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (15): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (16): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (17): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (18): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " 
(q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (19): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (20): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (21): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (22): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, 
out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (23): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " )\n", " (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn_pool): AttentionPoolLatent(\n", " (q): Linear(in_features=1024, out_features=1024, bias=True)\n", " (kv): Linear(in_features=1024, out_features=2048, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (fc_norm): Identity()\n", " (head_drop): Dropout(p=0.0, inplace=False)\n", " (head): Identity()\n", " )\n", ")\n" ] } ], "source": [ "print(vl_gpt.vision_model)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LlamaForCausalLM(\n", " (model): LlamaModel(\n", " (embed_tokens): Embedding(102400, 2048)\n", " (layers): ModuleList(\n", " (0-23): 24 x LlamaDecoderLayer(\n", " (self_attn): LlamaAttention(\n", " (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", " (k_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", " (v_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", " (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", " )\n", " (mlp): LlamaMLP(\n", " (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", " (up_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", " (down_proj): Linear(in_features=5632, out_features=2048, bias=False)\n", " (act_fn): SiLU()\n", " )\n", " (input_layernorm): 
LlamaRMSNorm((2048,), eps=1e-06)\n", " (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)\n", " )\n", " )\n", " (norm): LlamaRMSNorm((2048,), eps=1e-06)\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " (lm_head): Linear(in_features=2048, out_features=102400, bias=False)\n", ")\n" ] } ], "source": [ "print(vl_gpt.language_model)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MultiModalityCausalLM(\n", " (vision_model): CLIPVisionTower(\n", " (vision_tower): VisionTransformer(\n", " (patch_embed): PatchEmbed(\n", " (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))\n", " (norm): Identity()\n", " )\n", " (pos_drop): Dropout(p=0.0, inplace=False)\n", " (patch_drop): Identity()\n", " (norm_pre): Identity()\n", " (blocks): Sequential(\n", " (0): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (1): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (2): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (3): 
Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (4): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (5): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (6): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): 
Identity()\n", " )\n", " (7): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (8): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (9): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (10): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): 
Identity()\n", " (drop_path2): Identity()\n", " )\n", " (11): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (12): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (13): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (14): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, 
inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (15): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (16): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (17): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (18): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", 
" (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (19): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (20): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (21): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (22): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, 
out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " (23): Block(\n", " (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn): Attention(\n", " (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (attn_drop): Dropout(p=0.0, inplace=False)\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Identity()\n", " )\n", " (ls1): Identity()\n", " (drop_path1): Identity()\n", " (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " (ls2): Identity()\n", " (drop_path2): Identity()\n", " )\n", " )\n", " (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (attn_pool): AttentionPoolLatent(\n", " (q): Linear(in_features=1024, out_features=1024, bias=True)\n", " (kv): Linear(in_features=1024, out_features=2048, bias=True)\n", " (q_norm): Identity()\n", " (k_norm): Identity()\n", " (proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (proj_drop): Dropout(p=0.0, inplace=False)\n", " (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n", " (mlp): Mlp(\n", " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", " (act): GELU(approximate='none')\n", " (drop1): Dropout(p=0.0, inplace=False)\n", " (norm): Identity()\n", " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", " (drop2): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (fc_norm): Identity()\n", " (head_drop): Dropout(p=0.0, inplace=False)\n", " (head): Identity()\n", " )\n", " )\n", " (aligner): MlpProjector(\n", " (layers): Sequential(\n", " (0): Linear(in_features=1024, out_features=2048, bias=True)\n", " (1): GELU(approximate='none')\n", " (2): Linear(in_features=2048, out_features=2048, bias=True)\n", " )\n", " )\n", " (gen_vision_model): VQModel(\n", " (encoder): Encoder(\n", " (conv_in): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (conv_blocks): ModuleList(\n", " (0-1): 2 x Module(\n", " (res): ModuleList(\n", " (0-1): 2 x ResnetBlock(\n", " (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)\n", " (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (attn): ModuleList()\n", " (downsample): Downsample(\n", " (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2))\n", " )\n", " )\n", " (2): Module(\n", " (res): ModuleList(\n", " (0): ResnetBlock(\n", " (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)\n", " (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nin_shortcut): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (1): ResnetBlock(\n", " (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)\n", " (conv1): Conv2d(256, 256, 
kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (attn): ModuleList()\n", " (downsample): Downsample(\n", " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2))\n", " )\n", " )\n", " (3): Module(\n", " (res): ModuleList(\n", " (0-1): 2 x ResnetBlock(\n", " (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)\n", " (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (attn): ModuleList()\n", " (downsample): Downsample(\n", " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2))\n", " )\n", " )\n", " (4): Module(\n", " (res): ModuleList(\n", " (0): ResnetBlock(\n", " (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)\n", " (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nin_shortcut): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (1): ResnetBlock(\n", " (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (attn): ModuleList(\n", " (0-1): 2 x AttnBlock(\n", " (norm): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " )\n", " )\n", " (mid): ModuleList(\n", " (0): ResnetBlock(\n", " (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " (1): AttnBlock(\n", " (norm): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (2): ResnetBlock(\n", " (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (norm_out): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (conv_out): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " (decoder): Decoder(\n", " (conv_in): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 
1), padding=(1, 1))\n", " (mid): ModuleList(\n", " (0): ResnetBlock(\n", " (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " (1): AttnBlock(\n", " (norm): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (2): ResnetBlock(\n", " (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (conv_blocks): ModuleList(\n", " (0): Module(\n", " (res): ModuleList(\n", " (0-2): 3 x ResnetBlock(\n", " (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (attn): ModuleList(\n", " (0-2): 3 x AttnBlock(\n", " (norm): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (upsample): Upsample(\n", " (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (1): Module(\n", " (res): ModuleList(\n", " (0): ResnetBlock(\n", " (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)\n", " (conv1): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nin_shortcut): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (1-2): 2 x ResnetBlock(\n", " (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)\n", " (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (attn): ModuleList()\n", " (upsample): Upsample(\n", " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (2): Module(\n", " (res): ModuleList(\n", " (0-2): 3 x ResnetBlock(\n", " (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)\n", " (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (attn): ModuleList()\n", " 
(upsample): Upsample(\n", " (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (3): Module(\n", " (res): ModuleList(\n", " (0): ResnetBlock(\n", " (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)\n", " (conv1): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nin_shortcut): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (1-2): 2 x ResnetBlock(\n", " (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)\n", " (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (attn): ModuleList()\n", " (upsample): Upsample(\n", " (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (4): Module(\n", " (res): ModuleList(\n", " (0-2): 3 x ResnetBlock(\n", " (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)\n", " (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " (attn): ModuleList()\n", " )\n", " )\n", " (norm_out): GroupNorm(32, 128, eps=1e-06, affine=True)\n", " (conv_out): Conv2d(128, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " (quantize): VectorQuantizer(\n", " (embedding): Embedding(16384, 8)\n", " )\n", " (quant_conv): Conv2d(256, 8, kernel_size=(1, 1), stride=(1, 1))\n", " (post_quant_conv): Conv2d(8, 256, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (gen_aligner): MlpProjector(\n", " (layers): Sequential(\n", " (0): Linear(in_features=8, out_features=2048, bias=True)\n", " (1): GELU(approximate='none')\n", " (2): Linear(in_features=2048, out_features=2048, bias=True)\n", " )\n", " )\n", " (gen_head): vision_head(\n", " (output_mlp_projector): Linear(in_features=2048, out_features=2048, bias=True)\n", " (vision_activation): GELU(approximate='none')\n", " (vision_head): Linear(in_features=2048, out_features=16384, bias=True)\n", " )\n", " (gen_embed): Embedding(16384, 8)\n", " (language_model): LlamaForCausalLM(\n", " (model): LlamaModel(\n", " (embed_tokens): Embedding(102400, 2048)\n", " (layers): ModuleList(\n", " (0-23): 24 x LlamaDecoderLayer(\n", " (self_attn): LlamaAttention(\n", " (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", " (k_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", " (v_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", " (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", " )\n", " (mlp): LlamaMLP(\n", " (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", " (up_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", " (down_proj): Linear(in_features=5632, out_features=2048, bias=False)\n", " (act_fn): SiLU()\n", " )\n", " (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)\n", " (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)\n", " )\n", " )\n", " (norm): LlamaRMSNorm((2048,), eps=1e-06)\n", " (rotary_emb): LlamaRotaryEmbedding()\n", " )\n", " 
(lm_head): Linear(in_features=2048, out_features=102400, bias=False)\n", "   )\n", ")\n" ] } ], "source": [ "# Print the full Janus-Pro module tree: vision encoder, VQ image generator (encoder/decoder/quantizer), generation aligner and head, and the LLaMA language model\n", "print(vl_gpt)" ] },
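 { "cell_type": "markdown", "metadata": {}, "source": [ "A minimal sketch (added for illustration, not executed): tally the parameter count of each top-level component in the module tree printed above. It assumes `vl_gpt` is the `MultiModalityCausalLM` instance loaded earlier in this notebook; `named_children()` and `numel()` are standard PyTorch `nn.Module`/tensor APIs." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: per-component parameter counts for the module tree printed above.\n", "# Assumes vl_gpt is the MultiModalityCausalLM instance loaded earlier in this notebook.\n", "def count_params(module):\n", "    return sum(p.numel() for p in module.parameters())\n", "\n", "for name, child in vl_gpt.named_children():\n", "    print(f\"{name:20s} {count_params(child) / 1e6:8.1f}M parameters\")" ] } ], "metadata": { "kernelspec": { "display_name": "janus_env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 2 }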