import torch
from PIL import Image
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Load image captioning model
def load_image_captioning_model():
    model = VisionEncoderDecoderModel.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    ).to(device)
    feature_extractor = ViTImageProcessor.from_pretrained(
        "nlpconnect/vit-gpt2-image-captioning"
    )
    tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    return model, feature_extractor, tokenizer


# Generate a caption for an image
def generate_caption(image_path, model, feature_extractor, tokenizer):
    max_length = 16
    num_beams = 4
    image = Image.open(image_path).convert("RGB")
    pixel_values = feature_extractor(
        images=image, return_tensors="pt"
    ).pixel_values.to(device)
    with torch.no_grad():
        output_ids = model.generate(
            pixel_values, max_length=max_length, num_beams=num_beams
        )
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds[0].strip()


# Load visual question answering model
def load_vqa_model():
    # For simplicity, we reuse the same image captioning model.
    # In a real application, you would load a dedicated VQA model instead.
    return load_image_captioning_model()


# Answer a question about an image
def answer_question(image_path, question, model, feature_extractor, tokenizer):
    # Simplified placeholder: a proper VQA model would condition on the
    # question. Here we only generate a caption and wrap it in a template,
    # so the question itself does not influence the model output.
    caption = generate_caption(image_path, model, feature_extractor, tokenizer)
    return f"Regarding '{question}': the image shows {caption}"


# Load sentiment analysis model
def load_sentiment_model():
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


# Analyze sentiment of text
def analyze_sentiment(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(
        device
    )
    with torch.no_grad():
        outputs = model(**inputs)
    scores = torch.nn.functional.softmax(outputs.logits, dim=1)
    scores = scores.cpu().numpy()[0]
    # DistilBERT-SST2 has 2 labels: negative (0) and positive (1)
    sentiment = "positive" if scores[1] > scores[0] else "negative"
    confidence = float(max(scores))
    return sentiment, confidence
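
# --- Usage sketch (illustrative, not part of the module above) ---
# A minimal smoke test wiring the three pipelines together. The image path
# "example.jpg" and the sample question/text below are hypothetical
# placeholders; substitute your own inputs.
if __name__ == "__main__":
    caption_model, caption_fe, caption_tok = load_image_captioning_model()
    vqa_model, vqa_fe, vqa_tok = load_vqa_model()
    sent_model, sent_tok = load_sentiment_model()

    image_path = "example.jpg"  # hypothetical path; replace with a real image
    print("Caption:", generate_caption(image_path, caption_model, caption_fe, caption_tok))
    print(
        "VQA:",
        answer_question(
            image_path, "What is in the picture?", vqa_model, vqa_fe, vqa_tok
        ),
    )

    sentiment, confidence = analyze_sentiment("I love this photo!", sent_model, sent_tok)
    print(f"Sentiment: {sentiment} (confidence: {confidence:.2f})")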