joey1101 committed on
Commit
b1da77e
·
verified ·
1 Parent(s): 2da2d23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -62
app.py CHANGED
@@ -1,65 +1,55 @@
1
- # Import necessary libraries
2
- import streamlit as st # Streamlit for creating the web application
3
- from transformers import pipeline # Pipeline for using Hugging Face models
4
- from PIL import Image # PIL for image processing
5
-
6
- # Function to load models
7
- def load_models():
8
- # Load the image to text model
9
- caption_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large") # Load pre-trained image to text model
 
 
 
 
 
10
  # Load the text generation model
11
- story_model = pipeline("text-generation", model="gpt2") # Load pre-trained text generation model
12
- # Load the text-to-speech model
13
- tts_model = pipeline("text-to-speech", model="suno/bark") # Load a TTS model
14
- return caption_model, story_model, tts_model # Return all three models
15
-
16
- # Function to generate story from caption
17
- def generate_story(caption, story_model):
18
- # Generate a story based on the caption
19
- story = story_model(caption, max_length=100, num_return_sequences=1)[0]['generated_text'] # Generate the story
20
- return story # Return the generated story
21
 
22
  # Function to convert text to audio
23
- def text_to_audio(text, tts_model):
24
- audio = tts_model(text) # Generate audio from text using the TTS model
25
- return audio # Return the audio object
26
-
27
- # Function to process the uploaded image and generate a story
28
- def process_image(image, caption_model, story_model):
29
- # Generate a caption from the uploaded image
30
- result = caption_model(image) # Get the result from the model
31
- caption = result[0]['generated_text'] # Access the generated caption
32
- # Generate a story from the caption
33
- story = generate_story(caption, story_model) # Call the story generation function
34
- return caption, story # Return both caption and story
35
-
36
- # Main part
37
- def main():
38
- st.set_page_config(page_title="Storytelling Friend", page_icon="🦦") # Title of the application
39
- st.write("Upload an image to generate a story!") # Instructions for the user
40
-
41
- # Upload image section
42
- uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"]) # File uploader for images
43
-
44
- # Load models once
45
- caption_model, story_model, tts_model = load_models() # Load models
46
-
47
- if uploaded_file is not None:
48
- # Open and read the uploaded image
49
- image = Image.open(uploaded_file) # Open the uploaded image file
50
- st.image(image, caption="Uploaded Image", use_container_width=True) # Display the uploaded image
51
-
52
- # Process the image and generate story
53
- caption, story = process_image(image, caption_model, story_model) # Get caption and story
54
- st.subheader("Generated Caption:") # Subheader for caption
55
- st.write(caption) # Display the caption
56
- st.subheader("Generated Story:") # Subheader for story
57
- st.write(story) # Display the generated story
58
-
59
- # Convert story to audio and play it
60
- audio = text_to_audio(story, tts_model) # Convert story to audio
61
- st.audio(audio, format='audio/wav') # Play the audio
62
-
63
- # Run the app
64
- if __name__ == "__main__":
65
- main() # Call the main function to run the app
 
1
+ import streamlit as st # Streamlit for building the web application
2
+ from transformers import pipeline # Hugging Face Transformers pipeline for models
3
+ from PIL import Image # PIL for handling image files
4
+
5
# Function to convert image to text
def img2text(image):
    """Generate a descriptive caption for an uploaded image.

    Args:
        image: A PIL.Image (anything accepted by the HF image-to-text
            pipeline).

    Returns:
        str: The caption produced by the BLIP captioning model.
    """
    # Cache the pipeline on the function object so the (large) model is
    # downloaded/loaded only once per process, not on every Streamlit rerun.
    model = getattr(img2text, "_model", None)
    if model is None:
        model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
        img2text._model = model
    # The pipeline returns a list of dicts; take the first generated caption.
    return model(image)[0]["generated_text"]
12
+
13
# Function to generate a story based on the caption
def text2story(text):
    """Generate a short story seeded by the given caption text.

    Args:
        text (str): Image caption used as the story prompt.

    Returns:
        str: The generated story (the returned text includes the
        "Once upon a time, ..." prompt prefix).
    """
    # Cache the GPT-2 pipeline so the model is loaded only once per process,
    # not on every Streamlit rerun.
    model = getattr(text2story, "_model", None)
    if model is None:
        model = pipeline("text-generation", model="gpt2")
        text2story._model = model
    # max_length counts prompt + generated tokens; one sequence is enough here.
    story_text = model(f"Once upon a time, {text}.", max_length=100, num_return_sequences=1)
    return story_text[0]["generated_text"]  # Return the generated story
 
 
 
 
 
 
20
 
21
# Function to convert text to audio
def text2audio(story_text):
    """Synthesize speech audio from the story text.

    Args:
        story_text (str): The text to be spoken.

    Returns:
        dict: The TTS pipeline output — per the pipeline call below it is
        indexed by the caller as ``audio_data['audio']`` and
        ``audio_data['sampling_rate']``.
    """
    # Cache the TTS pipeline so the model is loaded only once per process,
    # not on every Streamlit rerun.
    model = getattr(text2audio, "_model", None)
    if model is None:
        model = pipeline("text-to-speech", model="facebook/mms-tts-eng")
        text2audio._model = model
    audio_data = model(story_text)  # Generate audio data from the story text
    return audio_data  # Return the audio data
28
+
29
# Main part of the application: upload an image, caption it, turn the
# caption into a story, and read the story aloud.
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")  # Set the title and icon of the app
st.header("Storytelling From Your Image")  # Header for the application
uploaded_file = st.file_uploader("Select an Image...", type=["jpg", "jpeg", "png"])  # File uploader for images

if uploaded_file is not None:
    # Open and display the uploaded image
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_container_width=True)

    # Stage 1: Image to Text
    st.text('Processing image to text...')  # Inform the user about the processing stage
    scenario = img2text(image)  # Get the caption for the uploaded image
    st.write("Caption:", scenario)  # Display the generated caption

    # Stage 2: Text to Story
    st.text('Generating a story...')  # Inform the user about the story generation stage
    story = text2story(scenario)  # Generate a story based on the caption
    st.write("Story:", story)  # Display the generated story

    # Stage 3: Story to Audio data
    st.text('Generating audio data...')  # Inform the user about the audio generation stage
    audio_data = text2audio(story)  # Convert the generated story into audio

    # Render the audio player directly. The previous "Play Audio" button was
    # a bug: clicking any Streamlit button triggers a full script rerun, so
    # the caption, story and audio were all regenerated from scratch before
    # the audio widget could appear.
    st.audio(audio_data['audio'], format="audio/wav", start_time=0, sample_rate=audio_data['sampling_rate'])