In [1]:
import gradio as gr
import torch
from transformers import AutoConfig, AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
from demo.cam import generate_gradcam, AttentionGuidedCAM
from captum.attr import LayerGradCam
from PIL import Image
from einops import rearrange

import numpy as np
import matplotlib.pyplot as plt
import os
import time

import torch.nn.functional as F
from scipy.ndimage import filters
from torch import nn


 from .autonotebook import tqdm as notebook_tqdm


Python version is above 3.10, patching the collections module.




In [2]:

# Load the Janus-Pro checkpoint, pick a compute backend, and build the
# processor/tokenizer used by the rest of the notebook.
model_path = "deepseek-ai/Janus-Pro-1B"
config = AutoConfig.from_pretrained(model_path)
language_config = config.language_config
# NOTE(review): eager attention is presumably forced so that attention
# weights are reachable by the CAM utilities imported above -- confirm.
language_config._attn_implementation = 'eager'
vl_gpt = AutoModelForCausalLM.from_pretrained(
    model_path,
    language_config=language_config,
    trust_remote_code=True,
    # Adding CLS token, will be handled manually. The loader reports
    # `vision_tower.cls_token` and `vision_tower.pos_embed` as newly
    # initialized because their shapes differ from the checkpoint.
    ignore_mismatched_sizes=True,
)

# Decide on the backend once so the device string, the dtype, and the model
# placement can never disagree. `cuda_device` keeps its historical name even
# though it may hold 'mps' or 'cpu'.
if torch.cuda.is_available():
    cuda_device = 'cuda'
    dtype = torch.bfloat16
elif torch.backends.mps.is_available():
    cuda_device = 'mps'
    dtype = torch.float16
    # Newly created tensors default to the MPS device as well.
    torch.set_default_device(cuda_device)
else:
    # Neither CUDA nor MPS is present: stay on CPU in full precision rather
    # than pointing the default device at a backend that does not exist.
    cuda_device = 'cpu'
    dtype = torch.float32

# Cast and move in a single step; casting alone would leave the weights on
# the CPU while inputs are sent to `cuda_device`.
vl_gpt = vl_gpt.to(device=cuda_device, dtype=dtype)

vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

Usage Class Token: True


Some weights of MultiModalityCausalLM were not initialized from the model checkpoint at deepseek-ai/Janus-Pro-1B and are newly initialized: ['vision_model.vision_tower.cls_token']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of MultiModalityCausalLM were not initialized from the model checkpoint at deepseek-ai/Janus-Pro-1B and are newly initialized because the shapes did not match:
- vision_model.vision_tower.pos_embed: found shape torch.Size([1, 576, 1024]) in the checkpoint and torch.Size([1, 577, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be 

In [3]:
# Dump the module tree of the vision encoder to inspect its layer names and sizes.
print(vl_gpt.vision_model)

CLIPVisionTower(
 (vision_tower): VisionTransformer(
 (patch_embed): PatchEmbed(
 (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
 (norm): Identity()
 )
 (pos_drop): Dropout(p=0.0, inplace=False)
 (patch_drop): Identity()
 (norm_pre): Identity()
 (blocks): Sequential(
 (0): Block(
 (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
 (attn): Attention(
 (qkv): Linear(in_features=1024, out_features=3072, bias=True)
 (q_norm): Identity()
 (k_norm): Identity()
 (attn_drop): Dropout(p=0.0, inplace=False)
 (proj): Linear(in_features=1024, out_features=1024, bias=True)
 (proj_drop): Identity()
 )
 (ls1): Identity()
 (drop_path1): Identity()
 (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
 (mlp): Mlp(
 (fc1): Linear(in_features=1024, out_features=4096, bias=True)
 (act): GELU(approximate='none')
 (drop1): Dropout(p=0.0, inplace=False)
 (norm): Identity()
 (fc2): Linear(in_features=4096, out_features=1024, bias=True)
 (drop2): Dropout(p=0.0, inplace=

In [4]:
# Dump the module tree of the language model to inspect its layer names and sizes.
print(vl_gpt.language_model)

LlamaForCausalLM(
 (model): LlamaModel(
 (embed_tokens): Embedding(102400, 2048)
 (layers): ModuleList(
 (0-23): 24 x LlamaDecoderLayer(
 (self_attn): LlamaAttention(
 (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
 (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
 (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
 (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
 )
 (mlp): LlamaMLP(
 (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
 (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
 (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
 (act_fn): SiLU()
 )
 (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
 (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
 )
 )
 (norm): LlamaRMSNorm((2048,), eps=1e-06)
 (rotary_emb): LlamaRotaryEmbedding()
 )
 (lm_head): Linear(in_features=2048, out_features=102400, bias=False)
)


In [5]:
# Dump the module tree of the full multimodal model (all sub-models together).
print(vl_gpt)

MultiModalityCausalLM(
 (vision_model): CLIPVisionTower(
 (vision_tower): VisionTransformer(
 (patch_embed): PatchEmbed(
 (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
 (norm): Identity()
 )
 (pos_drop): Dropout(p=0.0, inplace=False)
 (patch_drop): Identity()
 (norm_pre): Identity()
 (blocks): Sequential(
 (0): Block(
 (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
 (attn): Attention(
 (qkv): Linear(in_features=1024, out_features=3072, bias=True)
 (q_norm): Identity()
 (k_norm): Identity()
 (attn_drop): Dropout(p=0.0, inplace=False)
 (proj): Linear(in_features=1024, out_features=1024, bias=True)
 (proj_drop): Identity()
 )
 (ls1): Identity()
 (drop_path1): Identity()
 (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
 (mlp): Mlp(
 (fc1): Linear(in_features=1024, out_features=4096, bias=True)
 (act): GELU(approximate='none')
 (drop1): Dropout(p=0.0, inplace=False)
 (norm): Identity()
 (fc2): Linear(in_features=4096, out_features=1024, bias