updating prj

Files changed:
- app.py: +41 -127
- requirements.txt: +3 -6
app.py
CHANGED
@@ -1,133 +1,47 @@
-import streamlit as st
-from PIL import Image
-import numpy as np
-import tempfile
-import soundfile as sf
-import torch
-import easyocr
-import omegaconf
-
-
-st.markdown(
-    """
-    ...
-    """
-)
-
-# ---------------------------
-# Caching the OCR reader for performance
-# ---------------------------
-@st.cache_resource(show_spinner=False)
-def load_ocr_reader(languages):
-    # EasyOCR expects language codes like "en", "es", "ch_sim", "ar"
-    return easyocr.Reader(languages, gpu=False)
-
-# ---------------------------
-# Caching TTS model loading (Silero TTS)
-# ---------------------------
-@st.cache_resource(show_spinner=False)
-def load_tts_model(language):
-    # Map our language codes to Silero model speakers.
-    # Note: Silero officially supports 'en' (and some community models for other languages).
-    # For demonstration, if a language isn't available, we fall back to English.
-    lang_speaker_map = {
-        'en': 'v3_en',
-        'es': 'v3_es',  # if available; otherwise, you might need to train or use an English model
-        'ch': 'v3_en',  # fallback to English for now (or replace with an experimental Chinese model)
-        'ar': 'v3_en'   # fallback to English (or an experimental Arabic model if available)
-    }
-    speaker = lang_speaker_map.get(language, 'v3_en')
-    device = torch.device('cpu')
-    # Load the Silero TTS model from torch.hub.
-    # This command will download the model the first time you run it.
-    model, example_text, sample_rate, speakers = torch.hub.load(
-        repo_or_dir='snakers4/silero-models',
-        model='silero_tts',
-        language=language,
-        speaker=speaker
-    )
-    return model, sample_rate, speaker
-
-def synthesize_speech(text, language):
-    model, sample_rate, speaker = load_tts_model(language)
-    # Synthesize speech; the output is a NumPy array with the audio waveform.
-    audio = model.apply_tts(text=text, speaker=speaker, sample_rate=sample_rate)
-    return audio, sample_rate
-
-def save_audio(audio, sample_rate):
-    # Save audio to a temporary file and return its path.
-    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as f:
-        sf.write(f.name, audio, sample_rate)
-        return f.name
-
-def extract_text_from_image(image_array, languages):
-    reader = load_ocr_reader(languages)
-    results = reader.readtext(image_array)
-    # Concatenate detected text parts.
-    extracted_text = " ".join([res[1] for res in results])
-    return extracted_text
-
-# ---------------------------
-# Mapping for EasyOCR language codes (EasyOCR uses 'ch_sim' for Chinese simplified)
-# ---------------------------
-ocr_language_map = {
-    'en': 'en',
-    'es': 'es',
-    'ch': 'ch_sim',
-    'ar': 'ar'
-}
-
-# ---------------------------
-# Streamlit App UI
-# ---------------------------
-st.title("Image-to-Audio Description App")
-st.write("Upload an image or enter text to generate audio descriptions.")
-
-# Select language for both OCR and TTS
-language = st.selectbox("Select language", options=['en', 'es', 'ch', 'ar'], index=0)
-
-# Choose input method
-input_method = st.radio("Input method", options=["Upload Image", "Enter Text"])
-
-text = ""
-
-if input_method == "Upload Image":
-    uploaded_file = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"])
-    if uploaded_file is not None:
-        image = Image.open(uploaded_file)
-        st.image(image, caption='Uploaded Image', use_column_width=True)
-        # Convert PIL image to numpy array for EasyOCR
-        image_array = np.array(image)
-        with st.spinner("Extracting text from image..."):
-            # EasyOCR expects language codes; here we wrap our choice.
-            ocr_lang = [ocr_language_map.get(language, 'en')]
-            text = extract_text_from_image(image_array, ocr_lang)
-        st.write("**Extracted Text:**")
-        st.write(text)
-else:
-    text = st.text_area("Enter text to synthesize", "Type your description here...")
-
-if text and st.button("Generate Speech"):
-    with st.spinner("Synthesizing speech..."):
-        audio, sr = synthesize_speech(text, language)
-        audio_file = save_audio(audio, sr)
-    st.success("Audio generated!")
-    st.audio(audio_file)
+import cv2
+import gradio as gr
+import numpy as np
+from fer import FER
+
+# Initialize the pre-trained detector once so it isn't reinitialized on every function call.
+detector = FER(mtcnn=True)  # Optionally, set mtcnn=False to use a faster (but less accurate) cascade detector.
+
+def emotion_recognition(image):
+    """
+    Process the input image, detect emotions on faces,
+    and annotate the image with bounding boxes and emotion labels.
+
+    Parameters:
+        image (numpy.ndarray): Input image (RGB).
+
+    Returns:
+        numpy.ndarray: Annotated image with emotion labels.
+    """
+    # fer works with RGB images, which is what Gradio provides by default.
+    results = detector.detect_emotions(image)
+    annotated_image = image.copy()
+
+    # Loop through each detected face
+    for face in results:
+        (x, y, w, h) = face["box"]
+        # Get the dominant emotion for the detected face
+        dominant_emotion = max(face["emotions"].items(), key=lambda item: item[1])[0]
+        # Draw a bounding box around the face
+        cv2.rectangle(annotated_image, (x, y), (x + w, y + h), (0, 255, 0), 2)
+        # Put the emotion label above the bounding box
+        cv2.putText(annotated_image, dominant_emotion, (x, y - 10),
+                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)
+    return annotated_image
+
+# Create the Gradio interface
+interface = gr.Interface(
+    fn=emotion_recognition,
+    inputs=gr.Image(type="numpy", label="Input Image"),
+    outputs=gr.Image(type="numpy", label="Annotated Image"),
+    title="Facial Emotion Recognition",
+    description="Upload an image and let the app detect and annotate facial emotions."
+)
+
+# Run the app locally
+if __name__ == "__main__":
+    interface.launch()
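To exercise the new emotion_recognition function outside the Gradio UI, a quick local check can look like the sketch below. It assumes the new code is saved as app.py; the image path sample_face.jpg is only a placeholder, and the color conversions account for OpenCV reading images as BGR while the function expects RGB (which is what Gradio passes in).

# Local sanity check for the new app (sketch only; "sample_face.jpg" is a placeholder path).
import cv2
from app import emotion_recognition  # assumes the new code above is saved as app.py

img_bgr = cv2.imread("sample_face.jpg")             # OpenCV reads images as BGR
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)  # convert to the RGB layout the function expects
annotated = emotion_recognition(img_rgb)
cv2.imwrite("annotated_face.jpg", cv2.cvtColor(annotated, cv2.COLOR_RGB2BGR))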
requirements.txt
CHANGED
@@ -1,7 +1,4 @@
-streamlit
-easyocr
-torch
-soundfile
-Pillow
+gradio
+opencv-python
+fer
 numpy
-omegaconf
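After a rebuild (or pip install -r requirements.txt locally), a minimal import check like the sketch below confirms the trimmed dependency set resolves; note that fer generally pulls in its own backend dependencies (such as TensorFlow) during installation even though they are not listed here. This block is only a sanity check, not part of the Space.

# Minimal import check for the new dependency set (sanity sketch, not part of the app).
import cv2
import gradio as gr
import numpy as np
from fer import FER

print("opencv-python:", cv2.__version__)
print("gradio:", gr.__version__)
print("numpy:", np.__version__)
print("fer import OK:", FER is not None)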