notrey committed on
Commit
201eb03
·
1 Parent(s): c269716

Adding app file

Browse files
Files changed (1) hide show
  1. app.py +107 -0
app.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import tempfile

import easyocr
import numpy as np
import soundfile as sf
import streamlit as st
import torch
from PIL import Image
8
+
9
+ # ---------------------------
10
+ # Caching the OCR reader for performance
11
+ # ---------------------------
12
@st.cache_resource(show_spinner=False)
def load_ocr_reader(languages):
    """Build and cache an EasyOCR reader for the given language codes.

    *languages* uses EasyOCR codes such as "en", "es", "ch_sim", "ar".
    Runs CPU-only so the app works on hosts without a GPU.
    """
    reader = easyocr.Reader(languages, gpu=False)
    return reader
16
+
17
+ # ---------------------------
18
+ # Caching TTS model loading (Silero TTS)
19
+ # ---------------------------
20
@st.cache_resource(show_spinner=False)
def load_tts_model(language):
    """Load and cache a Silero TTS model for *language*.

    Returns (model, sample_rate, voice) where *voice* is a speaker name
    accepted by ``model.apply_tts``. Languages without a Silero model
    ('ch', 'ar') fall back to the English model and voice.
    """
    # (hub language, hub model id, apply_tts voice) per app language code.
    # NOTE(review): 'es' assumes a v3_es model is published on torch.hub —
    # confirm; otherwise map it to the English fallback too.
    lang_config = {
        'en': ('en', 'v3_en', 'en_0'),
        'es': ('es', 'v3_es', 'es_0'),
        'ch': ('en', 'v3_en', 'en_0'),  # no Chinese Silero model yet
        'ar': ('en', 'v3_en', 'en_0'),  # no Arabic Silero model yet
    }
    hub_language, model_id, voice = lang_config.get(language, ('en', 'v3_en', 'en_0'))

    device = torch.device('cpu')
    # The 'silero_tts' hub entrypoint returns (model, example_text) — a
    # two-tuple, not four values. Downloads the model on first call.
    model, _example_text = torch.hub.load(
        repo_or_dir='snakers4/silero-models',
        model='silero_tts',
        language=hub_language,  # must match the model id, not the UI choice
        speaker=model_id,
    )
    model.to(device)

    # Silero v3 models support 8000/24000/48000 Hz; use the highest.
    sample_rate = 48000
    return model, sample_rate, voice
42
+
43
def synthesize_speech(text, language):
    """Synthesize speech for *text* in *language*.

    Returns (audio, sample_rate) where *audio* is a 1-D NumPy float array
    suitable for ``soundfile.write``.
    """
    model, sample_rate, speaker = load_tts_model(language)
    # apply_tts returns a torch tensor; convert explicitly so callers
    # always receive a plain NumPy array as documented.
    audio_tensor = model.apply_tts(text=text, speaker=speaker, sample_rate=sample_rate)
    return audio_tensor.numpy(), sample_rate
48
+
49
def save_audio(audio, sample_rate):
    """Write *audio* to a temporary .wav file and return its path.

    The caller is responsible for the file's lifetime (it is not
    auto-deleted, so ``st.audio`` can read it after this returns).
    """
    fd, path = tempfile.mkstemp(suffix='.wav')
    # Close our handle before soundfile reopens the file by name:
    # writing to a still-open NamedTemporaryFile fails on Windows.
    os.close(fd)
    sf.write(path, audio, sample_rate)
    return path
54
+
55
def extract_text_from_image(image_array, languages):
    """Run OCR over *image_array* and return all detected text, space-joined."""
    ocr = load_ocr_reader(languages)
    detections = ocr.readtext(image_array)
    # Each detection is (bbox, text, confidence); keep only the text part.
    return " ".join(detection[1] for detection in detections)
61
+
62
+ # ---------------------------
63
+ # Mapping for EasyOCR language codes (EasyOCR uses 'ch_sim' for Chinese simplified)
64
+ # ---------------------------
65
# Translate the app's language codes to EasyOCR's codes
# (EasyOCR uses 'ch_sim' for simplified Chinese; the rest match).
ocr_language_map = {
    'en': 'en',
    'es': 'es',
    'ch': 'ch_sim',
    'ar': 'ar'
}
71
+
72
+ # ---------------------------
73
+ # Streamlit App UI
74
+ # ---------------------------
75
# Page header.
st.title("Image-to-Audio Description App")
st.write("Upload an image or enter text to generate audio descriptions.")

# One language choice drives both OCR and TTS (see lang maps above/below).
language = st.selectbox("Select language", options=['en', 'es', 'ch', 'ar'], index=0)

# The user either uploads an image (OCR path) or types text directly.
input_method = st.radio("Input method", options=["Upload Image", "Enter Text"])

text = ""

if input_method == "Upload Image":
    uploaded_file = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        image = Image.open(uploaded_file)
        st.image(image, caption='Uploaded Image', use_column_width=True)
        # EasyOCR consumes NumPy arrays, not PIL images.
        image_array = np.array(image)
        with st.spinner("Extracting text from image..."):
            # Wrap the single app code in a list of EasyOCR codes.
            ocr_lang = [ocr_language_map.get(language, 'en')]
            text = extract_text_from_image(image_array, ocr_lang)
        st.write("**Extracted Text:**")
        st.write(text)
else:
    # NOTE(review): the prefilled placeholder is real text — clicking
    # "Generate Speech" without editing will synthesize it.
    text = st.text_area("Enter text to synthesize", "Type your description here...")

# Only offer synthesis once we actually have text (OCR result or typed).
if text and st.button("Generate Speech"):
    with st.spinner("Synthesizing speech..."):
        audio, sr = synthesize_speech(text, language)
        audio_file = save_audio(audio, sr)
        st.success("Audio generated!")
        st.audio(audio_file)