import torch
import numpy as np
import spaces
from PIL import Image, ImageDraw, ImageFont
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    CLIPModel,
    CLIPProcessor,
    LlavaForConditionalGeneration,
    LlavaNextForConditionalGeneration,
    LlavaNextProcessor,
    LlavaOnevisionForConditionalGeneration,
    PaliGemmaForConditionalGeneration,
)
from janus.models import MultiModalityCausalLM, VLChatProcessor


@spaces.GPU(duration=120)
def set_dtype_device(model, precision=16, device_map=None):
    # Half precision: bfloat16 on GPU, float16 on CPU.
    # Full precision falls back to float32 (torch has no bfloat32 dtype).
    if precision == 16:
        dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float16
    else:
        dtype = torch.float32
    cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if torch.cuda.is_available():
        model = model.to(dtype)
        # With a device_map, accelerate has already placed the weights.
        if not device_map:
            model.cuda()
    else:
        torch.set_default_device("cpu")
        model = model.to(dtype)
    return model, dtype, cuda_device
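
# Example usage of set_dtype_device (a minimal sketch; the checkpoint id and
# `processor` are placeholders, not names used elsewhere in this file):
#
#   model = AutoModelForCausalLM.from_pretrained("org/checkpoint")
#   model, dtype, device = set_dtype_device(model, precision=16)
#   batch = processor(text="...", return_tensors="pt").to(device, dtype=dtype)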


class Model_Utils:
    def __init__(self):
        pass

    @spaces.GPU(duration=120)
    def prepare_inputs(self):
        raise NotImplementedError

    @spaces.GPU(duration=120)
    def generate_outputs(self):
        raise NotImplementedError


class Clip_Utils(Model_Utils):
    def __init__(self):
        self.edge = 224
        super().__init__()

    def init_Clip(self):
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        # Force square 224x224 inputs to match the ViT-B/32 input resolution.
        self.processor.feature_extractor.size = {"height": self.edge, "width": self.edge}

    @spaces.GPU(duration=120)
    def prepare_inputs(self, question_lst, image):
        image = Image.fromarray(image)
        inputs = self.processor(text=question_lst, images=image, return_tensors="pt", padding=True)
        return inputs
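
# Example usage (a minimal sketch; `frame` is assumed to be an HxWx3 uint8 numpy array):
#
#   clip = Clip_Utils()
#   clip.init_Clip()
#   inputs = clip.prepare_inputs(["a bar chart", "a line chart"], frame)
#   logits = clip.model(**inputs).logits_per_image   # shape (1, num_prompts)
#   probs = logits.softmax(dim=-1)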


class Janus_Utils(Model_Utils):
    def __init__(self):
        super().__init__()

    def init_Janus(self, num_params="1B"):
        model_path = f"deepseek-ai/Janus-Pro-{num_params}"
        config = AutoConfig.from_pretrained(model_path)
        language_config = config.language_config
        # Eager attention keeps per-layer attention weights accessible at generation time.
        language_config._attn_implementation = 'eager'
        self.vl_gpt = AutoModelForCausalLM.from_pretrained(
            model_path,
            language_config=language_config,
            trust_remote_code=True,
            ignore_mismatched_sizes=True,
        )
        self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
        self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
        self.tokenizer = self.vl_chat_processor.tokenizer

        return self.vl_gpt, self.tokenizer

    @spaces.GPU(duration=120)
    def prepare_inputs(self, question, image, answer=None):
        conversation = [
            {
                "role": "<|User|>",
                "content": f"<image_placeholder>\n{question}",
                "images": [image],
            },
            {"role": "<|Assistant|>", "content": answer if answer else ""},
        ]

        pil_images = [Image.fromarray(image)]
        prepare_inputs = self.vl_chat_processor(
            conversations=conversation, images=pil_images, force_batchify=True
        ).to(self.cuda_device, dtype=self.dtype)

        return prepare_inputs

    @spaces.GPU(duration=120)
    def generate_inputs_embeddings(self, prepare_inputs):
        return self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    @spaces.GPU(duration=120)
    def generate_outputs(self, inputs_embeds, prepare_inputs, temperature, top_p, with_attn=False):
        outputs = self.vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=self.tokenizer.eos_token_id,
            bos_token_id=self.tokenizer.bos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=temperature > 0,  # greedy decoding at temperature 0
            use_cache=True,
            temperature=temperature,
            top_p=top_p,
            return_dict_in_generate=True,
            output_attentions=True,
        )
        return outputs
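
# Example end-to-end usage (a minimal sketch; `frame` is assumed to be an HxWx3
# uint8 numpy array, and decoding follows the Janus-Pro reference code):
#
#   janus = Janus_Utils()
#   janus.init_Janus(num_params="1B")
#   prep = janus.prepare_inputs("Describe the chart.", frame)
#   embeds = janus.generate_inputs_embeddings(prep)
#   out = janus.generate_outputs(embeds, prep, temperature=0.0, top_p=0.9)
#   text = janus.tokenizer.decode(out.sequences[0].tolist(), skip_special_tokens=True)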


class LLaVA_Utils(Model_Utils):
    def __init__(self):
        super().__init__()

    def init_LLaVA(self, version):
        if version == "1.5":
            model_path = "llava-hf/llava-1.5-7b-hf"
            config = AutoConfig.from_pretrained(model_path)

            self.vl_gpt = LlavaForConditionalGeneration.from_pretrained(
                model_path,
                low_cpu_mem_usage=True,
                attn_implementation='eager',
                device_map="auto",
                output_attentions=True,
            )
            # device_map="auto" already placed the weights, so skip the manual .cuda() move.
            self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt, device_map="auto")
            self.processor = AutoProcessor.from_pretrained(model_path)
            self.tokenizer = self.processor.tokenizer
        else:
            model_path = "llava-hf/llava-onevision-qwen2-7b-si-hf"
            self.processor = AutoProcessor.from_pretrained(model_path)
            self.vl_gpt = LlavaOnevisionForConditionalGeneration.from_pretrained(
                model_path,
                torch_dtype=torch.float16,
                device_map="auto",
                low_cpu_mem_usage=True,
                attn_implementation='eager',
                output_attentions=True,
            )
            self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt, device_map="auto")
            self.tokenizer = self.processor.tokenizer

        return self.vl_gpt, self.tokenizer

    @spaces.GPU(duration=120)
    def prepare_inputs(self, question, image, answer=None):
        # Build the user turn once; append the assistant turn only when an answer is given.
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image"},
                ],
            },
        ]
        if answer:
            conversation.append(
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": answer},
                    ],
                }
            )

        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
        pil_images = [Image.fromarray(image).resize((384, 384))]
        prepare_inputs = self.processor(
            images=pil_images, text=prompt, return_tensors="pt"
        ).to(self.cuda_device, dtype=self.dtype)

        return prepare_inputs

    @spaces.GPU(duration=120)
    def generate_inputs_embeddings(self, prepare_inputs):
        return self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    @spaces.GPU(duration=120)
    def generate_outputs(self, prepare_inputs, temperature, top_p):
        do_sample = temperature > 0
        gen_kwargs = dict(
            max_new_tokens=512,
            do_sample=do_sample,
            use_cache=True,
            return_dict_in_generate=True,
            output_attentions=True,
        )
        # Forward the sampling knobs only when sampling; greedy decoding ignores them.
        if do_sample:
            gen_kwargs.update(temperature=temperature, top_p=top_p)
        outputs = self.vl_gpt.generate(**prepare_inputs, **gen_kwargs)
        return outputs
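
# Example usage (a minimal sketch; `frame` is assumed to be an HxWx3 uint8 numpy
# array; the slice drops the echoed prompt tokens before decoding):
#
#   llava = LLaVA_Utils()
#   llava.init_LLaVA(version="1.5")
#   prep = llava.prepare_inputs("What does the x-axis show?", frame)
#   out = llava.generate_outputs(prep, temperature=0.0, top_p=0.9)
#   new_tokens = out.sequences[0][prep["input_ids"].shape[1]:]
#   text = llava.processor.decode(new_tokens, skip_special_tokens=True)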


class ChartGemma_Utils(Model_Utils):
    def __init__(self):
        super().__init__()

    def init_ChartGemma(self):
        model_path = "ahmed-masry/chartgemma"
        self.vl_gpt = PaliGemmaForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            attn_implementation="eager",
            output_attentions=True,
        )
        self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
        self.processor = AutoProcessor.from_pretrained(model_path)
        self.tokenizer = self.processor.tokenizer

        return self.vl_gpt, self.tokenizer

    @spaces.GPU(duration=120)
    def prepare_inputs(self, question, image):
        pil_image = Image.fromarray(image)
        prepare_inputs = self.processor(
            images=pil_image, text=[question], return_tensors="pt"
        ).to(self.cuda_device, dtype=self.dtype)

        return prepare_inputs

    @spaces.GPU(duration=120)
    def generate_inputs_embeddings(self, prepare_inputs):
        return self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    @spaces.GPU(duration=120)
    def generate_outputs(self, prepare_inputs, temperature, top_p):
        do_sample = temperature > 0
        gen_kwargs = dict(
            max_new_tokens=512,
            do_sample=do_sample,
            use_cache=True,
            return_dict_in_generate=True,
            output_attentions=True,
        )
        # Forward the sampling knobs only when sampling; greedy decoding ignores them.
        if do_sample:
            gen_kwargs.update(temperature=temperature, top_p=top_p)
        outputs = self.vl_gpt.generate(**prepare_inputs, **gen_kwargs)
        return outputs
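
# Example usage (a minimal sketch; `frame` is assumed to be an HxWx3 uint8 numpy array):
#
#   gemma = ChartGemma_Utils()
#   gemma.init_ChartGemma()
#   prep = gemma.prepare_inputs("What is the highest bar?", frame)
#   out = gemma.generate_outputs(prep, temperature=0.0, top_p=0.9)
#   new_tokens = out.sequences[0][prep["input_ids"].shape[1]:]
#   text = gemma.processor.decode(new_tokens, skip_special_tokens=True)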


def add_title_to_image(image, title, font_size=50):
    """Adds a centered title banner above an image using PIL and textbbox()."""
    img_width, img_height = image.size

    # White banner tall enough for the text plus a small margin.
    title_height = font_size + 10
    title_image = Image.new("RGB", (img_width, title_height), color=(255, 255, 255))
    draw = ImageDraw.Draw(title_image)

    # Fall back to PIL's bundled font when Arial is unavailable
    # (ImageFont.load_default accepts a size argument from Pillow 10.1 onward).
    try:
        font = ImageFont.truetype("arial.ttf", font_size)
    except OSError:
        font = ImageFont.load_default(font_size)

    # Measure the rendered text so it can be centered within the banner.
    text_bbox = draw.textbbox((0, 0), title, font=font)
    text_width = text_bbox[2] - text_bbox[0]
    text_height = text_bbox[3] - text_bbox[1]

    text_position = ((img_width - text_width) // 2, (title_height - text_height) // 2)
    draw.text(text_position, title, fill="black", font=font)

    # Stack the banner above the original image.
    combined = Image.new("RGB", (img_width, img_height + title_height))
    combined.paste(title_image, (0, 0))
    combined.paste(image, (0, title_height))

    return combined
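
# Example usage (a minimal sketch; the file paths are placeholders):
#
#   img = Image.open("chart.png")
#   titled = add_title_to_image(img, "Attention Overlay", font_size=40)
#   titled.save("chart_titled.png")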