import torch
from PIL import Image
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Load image captioning model
def load_image_captioning_model():
    model = VisionEncoderDecoderModel.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    ).to(device)
    feature_extractor = ViTImageProcessor.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    )
    tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    return model, feature_extractor, tokenizer


# Generate a caption for an image
def generate_caption(image_path, model, feature_extractor, tokenizer):
    max_length = 16
    num_beams = 4
    image = Image.open(image_path).convert("RGB")
    pixel_values = feature_extractor(
        images=image, return_tensors="pt"
    ).pixel_values.to(device)
    with torch.no_grad():
        output_ids = model.generate(
            pixel_values, max_length=max_length, num_beams=num_beams
        )
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds[0].strip()


# Load visual question answering model
def load_vqa_model():
    # For simplicity, we reuse the same image captioning model.
    # In a real application, you would load a dedicated VQA model instead.
    return load_image_captioning_model()


# Answer a question about an image
def answer_question(image_path, question, model, feature_extractor, tokenizer):
    # Simplified placeholder: a proper VQA model would condition on the
    # question. Here we only generate a caption and wrap it in a template,
    # so the question itself does not influence the model output.
    caption = generate_caption(image_path, model, feature_extractor, tokenizer)
    return f"Regarding '{question}': the image shows {caption}"


# Load sentiment analysis model
def load_sentiment_model():
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


# Analyze sentiment of text
def analyze_sentiment(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(
        device
    )
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.nn.functional.softmax(outputs.logits, dim=1)
    scores = scores.cpu().numpy()[0]
    # DistilBERT-SST2 has 2 labels: negative (0) and positive (1)
    sentiment = "positive" if scores[1] > scores[0] else "negative"
    confidence = float(max(scores))
    return sentiment, confidence
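
# --- Usage sketch (illustrative, not part of the module above) ---
# A minimal smoke test wiring the three pipelines together. The image path
# "example.jpg" and the sample question/text below are hypothetical
# placeholders; substitute your own inputs.
if __name__ == "__main__":
    caption_model, caption_fe, caption_tok = load_image_captioning_model()
    vqa_model, vqa_fe, vqa_tok = load_vqa_model()
    sent_model, sent_tok = load_sentiment_model()

    image_path = "example.jpg"  # hypothetical path; replace with a real image
    print("Caption:", generate_caption(image_path, caption_model, caption_fe, caption_tok))
    print(
        "VQA:",
        answer_question(
            image_path, "What is in the picture?", vqa_model, vqa_fe, vqa_tok
        ),
    )

    sentiment, confidence = analyze_sentiment("I love this photo!", sent_model, sent_tok)
    print(f"Sentiment: {sentiment} (confidence: {confidence:.2f})")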