updating prj

Files changed:
- app.py: +41 -127
- requirements.txt: +3 -6
app.py
CHANGED
@@ -1,133 +1,47 @@
-import streamlit as st
-from PIL import Image
-import numpy as np
-import tempfile
-import soundfile as sf
-import torch
-import easyocr
-import omegaconf
-
-
-st.markdown(
-    """
-    ...
-    """
-)
-
-# ---------------------------
-# Caching the OCR reader for performance
-# ---------------------------
-@st.cache_resource(show_spinner=False)
-def load_ocr_reader(languages):
-    # EasyOCR expects language codes like "en", "es", "ch_sim", "ar"
-    return easyocr.Reader(languages, gpu=False)
-
-# ---------------------------
-# Caching TTS model loading (Silero TTS)
-# ---------------------------
-@st.cache_resource(show_spinner=False)
-def load_tts_model(language):
-    # Map our language codes to Silero model speakers.
-    # Note: Silero officially supports 'en' (and some community models for other languages).
-    # For demonstration, if a language isn't available, we fall back to English.
-    lang_speaker_map = {
-        'en': 'v3_en',
-        'es': 'v3_es',  # if available; otherwise, you might need to train or use an English model
-        'ch': 'v3_en',  # fallback to English for now (or replace with an experimental Chinese model)
-        'ar': 'v3_en'   # fallback to English (or an experimental Arabic model if available)
-    }
-    speaker = lang_speaker_map.get(language, 'v3_en')
-    device = torch.device('cpu')
-    # Load the Silero TTS model from torch.hub.
-    # This command will download the model the first time you run it.
-    model, example_text, sample_rate, speakers = torch.hub.load(
-        repo_or_dir='snakers4/silero-models',
-        model='silero_tts',
-        language=language,
-        speaker=speaker
-    )
-    return model, sample_rate, speaker
-
-def synthesize_speech(text, language):
-    model, sample_rate, speaker = load_tts_model(language)
-    # Synthesize speech; the output is a NumPy array with the audio waveform.
-    audio = model.apply_tts(text=text, speaker=speaker, sample_rate=sample_rate)
-    return audio, sample_rate
-
-def save_audio(audio, sample_rate):
-    # Save audio to a temporary file and return its path.
-    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as f:
-        sf.write(f.name, audio, sample_rate)
-        return f.name
-
-def extract_text_from_image(image_array, languages):
-    reader = load_ocr_reader(languages)
-    results = reader.readtext(image_array)
-    # Concatenate detected text parts.
-    extracted_text = " ".join([res[1] for res in results])
-    return extracted_text
-
-# ---------------------------
-# Mapping for EasyOCR language codes (EasyOCR uses 'ch_sim' for Chinese simplified)
-# ---------------------------
-ocr_language_map = {
-    'en': 'en',
-    'es': 'es',
-    'ch': 'ch_sim',
-    'ar': 'ar'
-}
-
-# ---------------------------
-# Streamlit App UI
-# ---------------------------
-st.title("Image-to-Audio Description App")
-st.write("Upload an image or enter text to generate audio descriptions.")
-
-# Select language for both OCR and TTS
-language = st.selectbox("Select language", options=['en', 'es', 'ch', 'ar'], index=0)
-
-# Choose input method
-input_method = st.radio("Input method", options=["Upload Image", "Enter Text"])
-
-text = ""
-
-if input_method == "Upload Image":
-    uploaded_file = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"])
-    if uploaded_file is not None:
-        image = Image.open(uploaded_file)
-        st.image(image, caption='Uploaded Image', use_column_width=True)
-        # Convert PIL image to numpy array for EasyOCR
-        image_array = np.array(image)
-        with st.spinner("Extracting text from image..."):
-            # EasyOCR expects language codes; here we wrap our choice.
-            ocr_lang = [ocr_language_map.get(language, 'en')]
-            text = extract_text_from_image(image_array, ocr_lang)
-        st.write("**Extracted Text:**")
-        st.write(text)
-else:
-    text = st.text_area("Enter text to synthesize", "Type your description here...")
-
-if text and st.button("Generate Speech"):
-    with st.spinner("Synthesizing speech..."):
-        audio, sr = synthesize_speech(text, language)
-        audio_file = save_audio(audio, sr)
-    st.success("Audio generated!")
-    st.audio(audio_file)
+import cv2
+import gradio as gr
+import numpy as np
+from fer import FER
+
+# Initialize the pre-trained detector once so it isn't reinitialized on every function call.
+detector = FER(mtcnn=True)  # Optionally, set mtcnn=False to use a faster (but less accurate) cascade detector.
+
+def emotion_recognition(image):
+    """
+    Process the input image, detect emotions on faces,
+    and annotate the image with bounding boxes and emotion labels.
+
+    Parameters:
+        image (numpy.ndarray): Input image (RGB).
+
+    Returns:
+        numpy.ndarray: Annotated image with emotion labels.
+    """
+    # fer works with RGB images, which is what Gradio provides by default.
+    results = detector.detect_emotions(image)
+    annotated_image = image.copy()
+
+    # Loop through each detected face
+    for face in results:
+        (x, y, w, h) = face["box"]
+        # Get the dominant emotion for the detected face
+        dominant_emotion = max(face["emotions"].items(), key=lambda item: item[1])[0]
+        # Draw a bounding box around the face
+        cv2.rectangle(annotated_image, (x, y), (x + w, y + h), (0, 255, 0), 2)
+        # Put the emotion label above the bounding box
+        cv2.putText(annotated_image, dominant_emotion, (x, y - 10),
+                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)
+    return annotated_image
+
+# Create the Gradio interface
+interface = gr.Interface(
+    fn=emotion_recognition,
+    inputs=gr.Image(type="numpy", label="Input Image"),
+    outputs=gr.Image(type="numpy", label="Annotated Image"),
+    title="Facial Emotion Recognition",
+    description="Upload an image and let the app detect and annotate facial emotions."
+)
+
+# Run the app locally
+if __name__ == "__main__":
+    interface.launch()
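To exercise the new emotion_recognition function outside the Gradio UI, a quick local check can look like the sketch below. It assumes the new code is saved as app.py; the image path sample_face.jpg is only a placeholder, and the color conversions account for OpenCV reading images as BGR while the function expects RGB (which is what Gradio passes in).

# Local sanity check for the new app (sketch only; "sample_face.jpg" is a placeholder path).
import cv2
from app import emotion_recognition  # assumes the new code above is saved as app.py

img_bgr = cv2.imread("sample_face.jpg")             # OpenCV reads images as BGR
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)  # convert to the RGB layout the function expects
annotated = emotion_recognition(img_rgb)
cv2.imwrite("annotated_face.jpg", cv2.cvtColor(annotated, cv2.COLOR_RGB2BGR))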
requirements.txt
CHANGED
@@ -1,7 +1,4 @@
-streamlit
-easyocr
-torch
-soundfile
-Pillow
+gradio
+opencv-python
+fer
 numpy
-omegaconf
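After a rebuild (or pip install -r requirements.txt locally), a minimal import check like the sketch below confirms the trimmed dependency set resolves; note that fer generally pulls in its own backend dependencies (such as TensorFlow) during installation even though they are not listed here. This block is only a sanity check, not part of the Space.

# Minimal import check for the new dependency set (sanity sketch, not part of the app).
import cv2
import gradio as gr
import numpy as np
from fer import FER

print("opencv-python:", cv2.__version__)
print("gradio:", gr.__version__)
print("numpy:", np.__version__)
print("fer import OK:", FER is not None)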