# Storytelling Friend — Streamlit app: image -> caption -> story -> speech.
# Import necessary libraries | |
import streamlit as st # Streamlit for creating the web application | |
from transformers import pipeline # Pipeline for using Hugging Face models | |
from PIL import Image # PIL for image processing | |
# Function to load models
@st.cache_resource  # cache across Streamlit reruns so the heavy pipelines load only once
def load_models():
    """Load the captioning, story-generation, and text-to-speech pipelines.

    Returns:
        tuple: (caption_model, story_model, tts_model) Hugging Face pipelines.
    """
    # BLIP captioning model: image -> short text description.
    caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    # GPT-2: continues a caption into a short story.
    story_model = pipeline("text-generation", model="gpt2")
    # Bark: synthesizes speech audio from text.
    tts_model = pipeline("text-to-speech", model="suno/bark")
    return caption_model, story_model, tts_model
# Function to generate story from caption
def generate_story(caption, story_model, max_length=100):
    """Generate a short story seeded by *caption*.

    Args:
        caption: Prompt text (typically an image caption).
        story_model: A Hugging Face text-generation pipeline (or compatible callable).
        max_length: Maximum token length of the generated text (default 100,
            matching the previous hard-coded value).

    Returns:
        str: The generated story text.
    """
    # num_return_sequences=1: only a single candidate story is needed.
    outputs = story_model(caption, max_length=max_length, num_return_sequences=1)
    return outputs[0]['generated_text']
# Function to convert text to audio
def text_to_audio(text, tts_model):
    """Synthesize speech for *text* with the given TTS pipeline.

    Args:
        text: The text to speak.
        tts_model: A Hugging Face text-to-speech pipeline (or compatible callable).

    Returns:
        The pipeline's output, passed through unchanged.
    """
    # Thin wrapper: delegate synthesis entirely to the pipeline.
    return tts_model(text)
# Function to process the uploaded image and generate a story
def process_image(image, caption_model, story_model):
    """Caption *image*, then expand the caption into a short story.

    Args:
        image: The uploaded image (PIL image).
        caption_model: Image-to-text pipeline producing [{'generated_text': ...}].
        story_model: Text-generation pipeline handed to generate_story.

    Returns:
        tuple: (caption, story), both strings.
    """
    # First model pass: describe the picture in one sentence.
    caption = caption_model(image)[0]['generated_text']
    # Second model pass: continue the caption into a short story.
    story = generate_story(caption, story_model)
    return caption, story
# Main part
def main():
    """Streamlit entry point: upload an image, show caption, story, and audio."""
    st.set_page_config(page_title="Storytelling Friend", page_icon="🦦")  # Browser-tab title/icon
    st.write("Upload an image to generate a story!")  # Instructions for the user
    # Upload image section
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
    # Load models once per run (cheap on reruns if load_models is cached).
    caption_model, story_model, tts_model = load_models()
    if uploaded_file is not None:
        # Open and display the uploaded image.
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_container_width=True)
        # Caption the image and generate the story from the caption.
        caption, story = process_image(image, caption_model, story_model)
        st.subheader("Generated Caption:")
        st.write(caption)
        st.subheader("Generated Story:")
        st.write(story)
        # Convert the story to speech and play it.
        audio = text_to_audio(story, tts_model)
        # BUG FIX: the TTS pipeline returns {"audio": ndarray, "sampling_rate": int};
        # st.audio needs the raw waveform plus its sample rate, not the whole dict.
        st.audio(audio["audio"], sample_rate=audio["sampling_rate"])
# Launch the app only when executed as a script (not on import).
if __name__ == "__main__":
    main()