# NOTE(review): this file lost its original line breaks during extraction; the
# code below is the identical token stream re-wrapped for readability. The
# file is also TRUNCATED mid-statement at the end — see the final NOTE.

import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
from threading import Thread
import re
import time
import torch
import spaces
import subprocess
from io import BytesIO

# Install flash-attn at process start. FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
# skips the CUDA compile step (typical on HF Spaces images that lack the
# CUDA build toolchain).
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

# Load the SmolVLM2 processor and model once at import time; the model is
# pinned to the first CUDA device in bfloat16 with flash-attention 2.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")
model = AutoModelForImageTextToText.from_pretrained(
    "HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
    _attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16
).to("cuda:0")


@spaces.GPU
def model_inference(
    input_dict,
    history,
    max_tokens
):
    """Gradio multimodal chat handler for SmolVLM2.

    Args:
        input_dict: Gradio MultimodalTextbox payload — a dict with a "text"
            string and an optional "files" list of uploaded file paths.
        history: chat history supplied by Gradio (unused in the visible
            portion of this function).
        max_tokens: generation length limit (used past the visible portion).

    NOTE(review): this function is truncated in the current view; the body
    below ends mid-statement.
    """
    text = input_dict["text"].strip()
    user_content = []
    media_queue = []
    # Classify each uploaded file as image or video by extension; anything
    # else is silently ignored.
    for file_path in input_dict.get("files", []):
        if file_path.lower().endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
            media_queue.append({"type": "image", "path": file_path})
        elif file_path.lower().endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
            media_queue.append({"type": "video", "path": file_path})

    # Guard clauses: a non-empty text query is required in all cases.
    if not text and not media_queue:
        gr.Warning("Please input a query and optionally image(s)/video(s).")
        return

    if not text and media_queue:
        gr.Warning("Please input a text query along with the image(s)/video(s).")
        return

    # NOTE(review): the empty string literal below is almost certainly a
    # placeholder token (e.g. "<image>") that was stripped by HTML-tag-removing
    # extraction — confirm against the original source. The statement (and the
    # rest of the function) is truncated at this point.
    if "" in text or "