import torch
import numpy as np
import spaces
from PIL import Image, ImageDraw, ImageFont
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    LlavaForConditionalGeneration,
    LlavaOnevisionForConditionalGeneration,
    LlavaNextForConditionalGeneration,
    LlavaNextProcessor,
    AutoProcessor,
    PaliGemmaForConditionalGeneration,
)
from transformers import CLIPProcessor, CLIPModel
from janus.models import MultiModalityCausalLM, VLChatProcessor


@spaces.GPU(duration=120)
def set_dtype_device(model, precision=16, device_map=None):
    """Cast the model to the requested precision and move it to GPU if available."""
    if precision == 16:
        dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float16
    else:
        dtype = torch.float32  # torch has no bfloat32; full precision is float32 on both devices
    cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if torch.cuda.is_available():
        model = model.to(dtype)
        if not device_map:
            model.cuda()
    else:
        torch.set_default_device("cpu")
        model = model.to(dtype)
    return model, dtype, cuda_device


class Model_Utils:
    def __init__(self):
        pass

    @spaces.GPU(duration=120)
    def prepare_inputs(self):
        raise NotImplementedError

    @spaces.GPU(duration=120)
    def generate_outputs(self):
        raise NotImplementedError


class Clip_Utils(Model_Utils):
    def __init__(self):
        self.edge = 224
        super().__init__()

    def init_Clip(self):
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.processor.feature_extractor.size = {"height": self.edge, "width": self.edge}

    @spaces.GPU(duration=120)
    def prepare_inputs(self, question_lst, image):
        image = Image.fromarray(image)
        # print("image_size: ", image.size)
        inputs = self.processor(text=question_lst, images=image, return_tensors="pt", padding=True)
        return inputs


class Janus_Utils(Model_Utils):
    def __init__(self):
        super().__init__()

    def init_Janus(self, num_params="1B"):
        model_path = f"deepseek-ai/Janus-Pro-{num_params}"
        config = AutoConfig.from_pretrained(model_path)
        language_config = config.language_config
        language_config._attn_implementation = 'eager'
        self.vl_gpt = AutoModelForCausalLM.from_pretrained(
            model_path,
            language_config=language_config,
            trust_remote_code=True,
            ignore_mismatched_sizes=True,
        )
        self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
        self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
        self.tokenizer = self.vl_chat_processor.tokenizer
        return self.vl_gpt, self.tokenizer

    @spaces.GPU(duration=120)
    def prepare_inputs(self, question, image, answer=None):
        conversation = [
            {
                "role": "<|User|>",
                # <image_placeholder> is the Janus image token expected in the user turn
                "content": f"<image_placeholder>\n{question}",
                "images": [image],
            },
            {"role": "<|Assistant|>", "content": answer if answer else ""},
        ]
        pil_images = [Image.fromarray(image)]
        prepare_inputs = self.vl_chat_processor(
            conversations=conversation,
            images=pil_images,
            force_batchify=True
        ).to(self.cuda_device, dtype=self.dtype)
        return prepare_inputs

    @spaces.GPU(duration=120)
    def generate_inputs_embeddings(self, prepare_inputs):
        return self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    @spaces.GPU(duration=120)
    def generate_outputs(self, inputs_embeds, prepare_inputs, temperature, top_p, with_attn=False):
        outputs = self.vl_gpt.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=self.tokenizer.eos_token_id,
            bos_token_id=self.tokenizer.bos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False if temperature == 0 else True,
            use_cache=True,
            temperature=temperature,
            top_p=top_p,
            return_dict_in_generate=True,
            output_attentions=True
        )
        return outputs
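
# Illustrative helper (not part of the original module): a minimal sketch of the
# intended Janus flow -- init_Janus -> prepare_inputs -> generate_inputs_embeddings
# -> generate_outputs. The image path and question are placeholder assumptions, and
# decoding the returned sequences with the chat processor's tokenizer is an
# assumption about how callers consume the output.
def _example_janus_flow(image_path="example_chart.png", question="What does this chart show?"):
    janus = Janus_Utils()
    vl_gpt, tokenizer = janus.init_Janus(num_params="1B")
    image = np.array(Image.open(image_path).convert("RGB"))  # prepare_inputs expects a numpy array
    inputs = janus.prepare_inputs(question, image)
    inputs_embeds = janus.generate_inputs_embeddings(inputs)
    outputs = janus.generate_outputs(inputs_embeds, inputs, temperature=0, top_p=0.95)
    return tokenizer.decode(outputs.sequences[0].tolist(), skip_special_tokens=True)
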
class LLaVA_Utils(Model_Utils):
    def __init__(self):
        super().__init__()

    def init_LLaVA(self, version):
        if version == "1.5":
            model_path = "llava-hf/llava-1.5-7b-hf"
            config = AutoConfig.from_pretrained(model_path)
            self.vl_gpt = LlavaForConditionalGeneration.from_pretrained(
                model_path,
                low_cpu_mem_usage=True,
                attn_implementation='eager',
                device_map="auto",
                output_attentions=True
            )
            self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
            self.processor = AutoProcessor.from_pretrained(model_path)
            self.tokenizer = self.processor.tokenizer
        else:
            model_path = "llava-hf/llava-onevision-qwen2-7b-si-hf"
            self.processor = AutoProcessor.from_pretrained(model_path)
            self.vl_gpt = LlavaOnevisionForConditionalGeneration.from_pretrained(
                model_path,
                torch_dtype=torch.float16,
                device_map="auto",
                low_cpu_mem_usage=True,
                attn_implementation='eager',
                output_attentions=True
            )
            self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt, device_map="auto")
            self.tokenizer = self.processor.tokenizer
        return self.vl_gpt, self.tokenizer

    @spaces.GPU(duration=120)
    def prepare_inputs(self, question, image, answer=None):
        if answer:
            conversation = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {"type": "image"},
                    ],
                },
                {
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": answer},
                    ],
                },
            ]
        else:
            conversation = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {"type": "image"},
                    ],
                },
            ]
        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
        pil_images = [Image.fromarray(image).resize((384, 384))]
        prepare_inputs = self.processor(
            images=pil_images,
            text=prompt,
            return_tensors="pt"
        ).to(self.cuda_device, dtype=self.dtype)
        return prepare_inputs

    @spaces.GPU(duration=120)
    def generate_inputs_embeddings(self, prepare_inputs):
        return self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    @spaces.GPU(duration=120)
    def generate_outputs(self, prepare_inputs, temperature, top_p):
        outputs = self.vl_gpt.generate(
            **prepare_inputs,
            max_new_tokens=512,
            do_sample=False if temperature == 0 else True,
            use_cache=True,
            return_dict_in_generate=True,
            output_attentions=True
        )
        return outputs


class ChartGemma_Utils(Model_Utils):
    def __init__(self):
        super().__init__()

    def init_ChartGemma(self):
        model_path = "ahmed-masry/chartgemma"
        self.vl_gpt = PaliGemmaForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            attn_implementation="eager",
            output_attentions=True
        )
        self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
        self.processor = AutoProcessor.from_pretrained(model_path)
        self.tokenizer = self.processor.tokenizer
        return self.vl_gpt, self.tokenizer

    @spaces.GPU(duration=120)
    def prepare_inputs(self, question, image):
        pil_image = Image.fromarray(image)
        prepare_inputs = self.processor(
            images=pil_image,
            text=[question],
            return_tensors="pt"
        ).to(self.cuda_device, dtype=self.dtype)
        return prepare_inputs

    @spaces.GPU(duration=120)
    def generate_inputs_embeddings(self, prepare_inputs):
        return self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    @spaces.GPU(duration=120)
    def generate_outputs(self, prepare_inputs, temperature, top_p):
        outputs = self.vl_gpt.generate(
            **prepare_inputs,
            max_new_tokens=512,
            do_sample=False if temperature == 0 else True,
            use_cache=True,
            return_dict_in_generate=True,
            output_attentions=True
        )
        return outputs
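
# Illustrative helper (not part of the original module): a minimal sketch of the
# ChartGemma flow -- init_ChartGemma -> prepare_inputs -> generate_outputs. The
# LLaVA classes above are driven the same way. The image path and question are
# placeholder assumptions; trimming the prompt tokens before decoding is an
# assumption about how callers extract the answer when generating from input_ids.
def _example_chartgemma_flow(image_path="example_chart.png", question="What is the highest bar?"):
    chartgemma = ChartGemma_Utils()
    vl_gpt, tokenizer = chartgemma.init_ChartGemma()
    image = np.array(Image.open(image_path).convert("RGB"))
    inputs = chartgemma.prepare_inputs(question, image)
    outputs = chartgemma.generate_outputs(inputs, temperature=0, top_p=0.95)
    prompt_len = inputs["input_ids"].shape[1]  # sequences include the prompt tokens
    return tokenizer.decode(outputs.sequences[0][prompt_len:], skip_special_tokens=True)
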
def add_title_to_image(image, title, font_size=50):
    """Adds a title above an image using PIL and textbbox()."""
    img_width, img_height = image.size

    # Create a blank image for the title
    title_height = font_size + 10  # some padding
    title_image = Image.new("RGB", (img_width, title_height), color=(255, 255, 255))  # white background
    draw = ImageDraw.Draw(title_image)

    # Load font
    try:
        font = ImageFont.truetype("arial.ttf", font_size)  # use Arial if available
    except OSError:
        font = ImageFont.load_default(font_size)  # fall back to the default font if Arial is not found

    # Get text size (updated for PIL >= 10)
    text_bbox = draw.textbbox((0, 0), title, font=font)
    text_width = text_bbox[2] - text_bbox[0]
    text_height = text_bbox[3] - text_bbox[1]

    # Center the title
    text_position = ((img_width - text_width) // 2, (title_height - text_height) // 2)
    draw.text(text_position, title, fill="black", font=font)

    # Concatenate title with image
    combined = Image.new("RGB", (img_width, img_height + title_height))
    combined.paste(title_image, (0, 0))  # place title at the top
    combined.paste(image, (0, title_height))  # place original image below
    return combined
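
# Illustrative helper (not part of the original module): add_title_to_image is a
# pure PIL utility, so it can be exercised without loading any model. The
# solid-color demo image and the title string are placeholder assumptions.
def _example_add_title():
    demo = Image.new("RGB", (400, 300), color=(200, 220, 255))  # stand-in for an attention overlay
    titled = add_title_to_image(demo, "Demo title", font_size=40)
    return titled  # 400 x 350: the 50 px title strip is pasted above the original image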