""" File: vlm.py Description: Vision language model utility functions. Heavily inspired (i.e. copied) from https://huggingface.co./spaces/HuggingFaceTB/SmolVLM2/blob/main/app.py Author: Didier Guillevic Date: 2025-04-02 """ from transformers import AutoProcessor, AutoModelForImageTextToText from transformers import TextIteratorStreamer from threading import Thread import re import time import torch import spaces import subprocess subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True) from io import BytesIO # # Load the model: HuggingFaceTB/SmolVLM2-2.2B-Instruct # model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" device = 'cuda' if torch.cuda.is_available() else 'cpu' processor = AutoProcessor.from_pretrained(model_id) model = AutoModelForImageTextToText.from_pretrained( model_id, _attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16 ).to(device) # # Build messages # def build_messages(input_dict: dict, history: list[tuple]): """Build messages given message & history from a **multimodal** chat interface. Args: input_dict: dictionary with keys: 'text', 'files' history: list of tuples with (message, response) Returns: list of messages (to be sent to the model) """ text = input_dict["text"] images = [] user_content = [] media_queue = [] if history == []: text = input_dict["text"].strip() for file in input_dict.get("files", []): if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")): media_queue.append({"type": "image", "path": file}) elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")): media_queue.append({"type": "video", "path": file}) if "" in text or "