import os

import gradio as gr
import matplotlib.pyplot as plt

from model_utils import (
    analyze_sentiment,
    answer_question,
    generate_caption,
    load_image_captioning_model,
    load_sentiment_model,
    load_vqa_model,
)

# Load models at startup
print("Loading models...")
image_caption_model, image_caption_processor, image_caption_tokenizer = (
    load_image_captioning_model()
)
vqa_model, vqa_processor, vqa_tokenizer = load_vqa_model()
sentiment_model, sentiment_tokenizer = load_sentiment_model()
print("Models loaded successfully!")


def image_caption_fn(image):
    # Save image temporarily; convert to RGB so RGBA/grayscale uploads can be written as JPEG
    temp_path = "temp_image.jpg"
    image.convert("RGB").save(temp_path)

    # Generate caption
    caption = generate_caption(
        temp_path, image_caption_model, image_caption_processor, image_caption_tokenizer
    )

    # Clean up
    if os.path.exists(temp_path):
        os.remove(temp_path)

    return caption


def vqa_fn(image, question):
    # Save image temporarily; convert to RGB so RGBA/grayscale uploads can be written as JPEG
    temp_path = "temp_image.jpg"
    image.convert("RGB").save(temp_path)

    # Answer question
    answer = answer_question(
        temp_path, question, vqa_model, vqa_processor, vqa_tokenizer
    )

    # Clean up
    if os.path.exists(temp_path):
        os.remove(temp_path)

    return answer


def sentiment_fn(text):
    sentiment, confidence = analyze_sentiment(
        text, sentiment_model, sentiment_tokenizer
    )
    confidence_percentage = f"{confidence:.2%}"

    # Create a simple bar chart for visualization
    labels = ["Negative", "Positive"]
    values = (
        [1 - confidence, confidence]
        if sentiment == "positive"
        else [confidence, 1 - confidence]
    )

    fig, ax = plt.subplots(figsize=(6, 3))
    bars = ax.bar(labels, values, color=["#FF6B6B", "#4ECDC4"])
    ax.set_ylim(0, 1)
    ax.set_title("Sentiment Analysis")
    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2.0,
            height + 0.02,
            f"{height:.2f}",
            ha="center",
            va="bottom",
        )

    return f"Sentiment: {sentiment.upper()} (Confidence: {confidence_percentage})", fig


# Create the Gradio interface
with gr.Blocks(title="Multi-Modal AI Demo") as demo:
    gr.Markdown("# Multi-Modal AI Demo")
    gr.Markdown(
        "This application demonstrates multi-modal AI capabilities using Hugging Face models."
    )

    with gr.Tab("Image Captioning"):
        gr.Markdown("## Image Captioning")
        gr.Markdown("Upload an image to generate a descriptive caption.")
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(type="pil", label="Input Image")
                caption_button = gr.Button("Generate Caption")
            with gr.Column():
                caption_output = gr.Textbox(
                    label="Generated Caption", interactive=False
                )
        caption_button.click(
            fn=image_caption_fn, inputs=[image_input], outputs=[caption_output]
        )

    with gr.Tab("Visual Question Answering"):
        gr.Markdown("## Visual Question Answering")
        gr.Markdown("Upload an image and ask a question about it.")
        with gr.Row():
            with gr.Column():
                vqa_image_input = gr.Image(type="pil", label="Input Image")
                vqa_question_input = gr.Textbox(label="Your Question")
                vqa_button = gr.Button("Get Answer")
            with gr.Column():
                vqa_output = gr.Textbox(label="Answer", interactive=False)
        vqa_button.click(
            fn=vqa_fn,
            inputs=[vqa_image_input, vqa_question_input],
            outputs=[vqa_output],
        )

    with gr.Tab("Sentiment Analysis"):
        gr.Markdown("## Sentiment Analysis")
        gr.Markdown("Enter some text to analyze its sentiment.")
        with gr.Row():
            with gr.Column():
                sentiment_input = gr.Textbox(label="Input Text")
                sentiment_button = gr.Button("Analyze Sentiment")
            with gr.Column():
                sentiment_output = gr.Textbox(label="Result", interactive=False)
                sentiment_plot = gr.Plot(label="Sentiment Distribution")
        sentiment_button.click(
            fn=sentiment_fn,
            inputs=[sentiment_input],
            outputs=[sentiment_output, sentiment_plot],
        )

    gr.Markdown("### About")
    gr.Markdown("""
    This demo uses the following pretrained models from Hugging Face:

    - Image Captioning: `nlpconnect/vit-gpt2-image-captioning`
    - Visual Question Answering: `nlpconnect/vit-gpt2-image-captioning` (simplified)
    - Sentiment Analysis: `distilbert-base-uncased-finetuned-sst-2-english`
    """)

# Launch the demo
if __name__ == "__main__":
    demo.launch(share=True)
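
# ---------------------------------------------------------------------------
# For reference: a minimal sketch of the image-captioning helpers that
# `model_utils` is assumed to provide. This is an assumption about their
# interface, not the actual implementation, and is kept in comments so it does
# not shadow the real module. It follows the standard usage of the
# `nlpconnect/vit-gpt2-image-captioning` checkpoint listed in the About tab.
#
# from PIL import Image
# from transformers import AutoTokenizer, ViTImageProcessor, VisionEncoderDecoderModel
#
#
# def load_image_captioning_model():
#     name = "nlpconnect/vit-gpt2-image-captioning"
#     model = VisionEncoderDecoderModel.from_pretrained(name)
#     processor = ViTImageProcessor.from_pretrained(name)
#     tokenizer = AutoTokenizer.from_pretrained(name)
#     return model, processor, tokenizer
#
#
# def generate_caption(image_path, model, processor, tokenizer):
#     image = Image.open(image_path).convert("RGB")
#     pixel_values = processor(images=image, return_tensors="pt").pixel_values
#     output_ids = model.generate(pixel_values, max_length=16, num_beams=4)
#     return tokenizer.decode(output_ids[0], skip_special_tokens=True)
# ---------------------------------------------------------------------------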