import torch
from PIL import Image
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load image captioning model
def load_image_captioning_model():
    model = VisionEncoderDecoderModel.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    ).to(device)
    feature_extractor = ViTImageProcessor.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    )
    tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    return model, feature_extractor, tokenizer
# Generate caption for an image
def generate_caption(image_path, model, feature_extractor, tokenizer):
    max_length = 16
    num_beams = 4
    image = Image.open(image_path).convert("RGB")
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(
        device
    )
    with torch.no_grad():
        output_ids = model.generate(
            pixel_values, max_length=max_length, num_beams=num_beams
        )
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds[0].strip()
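# Example usage (illustrative sketch, kept as comments so the module stays importable;
# "example.jpg" is an assumed local image path, not part of the original app):
#
#     cap_model, cap_extractor, cap_tokenizer = load_image_captioning_model()
#     print(generate_caption("example.jpg", cap_model, cap_extractor, cap_tokenizer))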
# Load visual question answering model
def load_vqa_model():
    # For simplicity, we'll use the same image captioning model
    # In a real application, you would use a dedicated VQA model
    model = VisionEncoderDecoderModel.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    ).to(device)
    feature_extractor = ViTImageProcessor.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    )
    tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    return model, feature_extractor, tokenizer
# Answer a question about an image
def answer_question(image_path, question, model, feature_extractor, tokenizer):
    # This is a simplified version - in a real app, you'd use a proper VQA model.
    # Here we just generate a caption and wrap it in a response template;
    # the question itself is not actually passed to the model.
    caption = generate_caption(image_path, model, feature_extractor, tokenizer)
    return f"Based on the image, which shows {caption}, I would say: {caption}"
# Load sentiment analysis model
def load_sentiment_model():
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer
# Analyze sentiment of text
def analyze_sentiment(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(
        device
    )
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.nn.functional.softmax(outputs.logits, dim=1)
    scores = scores.cpu().numpy()[0]
    # DistilBERT-SST2 has 2 labels: negative (0) and positive (1)
    sentiment = "positive" if scores[1] > scores[0] else "negative"
    confidence = float(max(scores))
    return sentiment, confidence
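# Minimal demo wiring the three pipelines together. This is a sketch, not part of the
# original app; "example.jpg", the sample question, and the sample text are assumed
# placeholders.
if __name__ == "__main__":
    # Image captioning
    cap_model, cap_extractor, cap_tokenizer = load_image_captioning_model()
    print("Caption:", generate_caption("example.jpg", cap_model, cap_extractor, cap_tokenizer))

    # Visual question answering (reuses the captioning model, see load_vqa_model above)
    vqa_model, vqa_extractor, vqa_tokenizer = load_vqa_model()
    print(
        "Answer:",
        answer_question(
            "example.jpg", "What is in the picture?", vqa_model, vqa_extractor, vqa_tokenizer
        ),
    )

    # Sentiment analysis
    sent_model, sent_tokenizer = load_sentiment_model()
    sentiment, confidence = analyze_sentiment("I love this app!", sent_model, sent_tokenizer)
    print(f"Sentiment: {sentiment} ({confidence:.2f})")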