import torch
from PIL import Image
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load image captioning model
def load_image_captioning_model():
    model = VisionEncoderDecoderModel.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    ).to(device)
    feature_extractor = ViTImageProcessor.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    )
    tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    return model, feature_extractor, tokenizer

# Generate caption for an image
def generate_caption(image_path, model, feature_extractor, tokenizer):
    max_length = 16  # keep generated captions short
    num_beams = 4  # beam search gives slightly better captions than greedy decoding
    image = Image.open(image_path).convert("RGB")
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(
        device
    )
    with torch.no_grad():
        output_ids = model.generate(
            pixel_values, max_length=max_length, num_beams=num_beams
        )
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds[0].strip()
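
# Usage sketch (commented out so it doesn't run on import); "example.jpg" is a
# placeholder path, not a file that ships with this script:
#
# cap_model, cap_fe, cap_tok = load_image_captioning_model()
# print(generate_caption("example.jpg", cap_model, cap_fe, cap_tok))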

# Load visual question answering model
def load_vqa_model():
    # For simplicity, we'll use the same image captioning model
    # In a real application, you would use a dedicated VQA model
    model = VisionEncoderDecoderModel.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    ).to(device)
    feature_extractor = ViTImageProcessor.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    )
    tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    return model, feature_extractor, tokenizer

# Answer a question about an image
def answer_question(image_path, question, model, feature_extractor, tokenizer):
    # This is a simplified placeholder - a real app would use a dedicated VQA
    # model (see the ViLT sketch below). Here we just generate a caption and
    # wrap it, together with the question, in a fixed template.
    caption = generate_caption(image_path, model, feature_extractor, tokenizer)
    return f'For the question "{question}": the image shows {caption}.'

# Load sentiment analysis model
def load_sentiment_model():
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

# Analyze sentiment of text
def analyze_sentiment(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(
        device
    )
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.nn.functional.softmax(outputs.logits, dim=1)
    scores = scores.cpu().numpy()[0]
    # DistilBERT-SST2 has 2 labels: negative (0) and positive (1)
    sentiment = "positive" if scores[1] > scores[0] else "negative"
    confidence = float(max(scores))
    return sentiment, confidence
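
# Minimal end-to-end smoke test, added as a sketch: "example.jpg" and the
# sample sentence are placeholders, not assets from the original Space.
if __name__ == "__main__":
    model, feature_extractor, tokenizer = load_image_captioning_model()
    print("Caption:", generate_caption("example.jpg", model, feature_extractor, tokenizer))
    print("VQA:", answer_question("example.jpg", "What is shown?", model, feature_extractor, tokenizer))

    sent_model, sent_tokenizer = load_sentiment_model()
    label, confidence = analyze_sentiment("I love this demo!", sent_model, sent_tokenizer)
    print(f"Sentiment: {label} ({confidence:.2%})")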