# CompVisProj / app.py
import streamlit as st
from PIL import Image
import numpy as np
import tempfile
import soundfile as sf
import torch
import easyocr
# ---------------------------
# Caching the OCR reader for performance
# ---------------------------
@st.cache_resource(show_spinner=False)
def load_ocr_reader(languages):
    # EasyOCR expects language codes like "en", "es", "ch_sim", "ar".
    return easyocr.Reader(languages, gpu=False)
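# For reference, reader.readtext() returns a list of (bbox, text, confidence)
# tuples, roughly:
#   [([[7, 3], [140, 3], [140, 32], [7, 32]], 'Hello world', 0.98), ...]
# extract_text_from_image below keeps only the text element of each tuple.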
# ---------------------------
# Caching TTS model loading (Silero TTS)
# ---------------------------
@st.cache_resource(show_spinner=False)
def load_tts_model(language):
    # Map our language codes to Silero model ids. Silero v3 ships official
    # English and Spanish models; Chinese and Arabic have no official Silero
    # model, so we fall back to English for those.
    lang_model_map = {
        'en': ('en', 'v3_en'),
        'es': ('es', 'v3_es'),
        'ch': ('en', 'v3_en'),  # fallback: no official Silero Chinese model
        'ar': ('en', 'v3_en')   # fallback: no official Silero Arabic model
    }
    hub_language, model_id = lang_model_map.get(language, ('en', 'v3_en'))
    device = torch.device('cpu')
    # Load the Silero TTS model from torch.hub; the weights are downloaded
    # the first time this runs. For silero_tts, torch.hub.load returns the
    # model plus an example text string.
    model, example_text = torch.hub.load(
        repo_or_dir='snakers4/silero-models',
        model='silero_tts',
        language=hub_language,
        speaker=model_id
    )
    model.to(device)
    # Pick a concrete voice and output rate for apply_tts. Silero v3 models
    # support 8000, 24000 and 48000 Hz; 'en_0' / 'es_0' are v3 voice ids.
    sample_rate = 48000
    speaker = 'es_0' if model_id == 'v3_es' else 'en_0'
    return model, sample_rate, speaker
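# Note: because load_tts_model is wrapped in st.cache_resource, each language's
# model is downloaded and initialised once per server process and then reused
# across reruns and sessions.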
def synthesize_speech(text, language):
    model, sample_rate, speaker = load_tts_model(language)
    # apply_tts returns a 1-D torch tensor holding the waveform; convert it
    # to a NumPy array so soundfile can write it.
    audio = model.apply_tts(text=text, speaker=speaker, sample_rate=sample_rate)
    return audio.numpy(), sample_rate
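# Usage sketch outside Streamlit (hypothetical text; assumes the model
# download succeeds):
#   audio, sr = synthesize_speech("Hello world", "en")
#   sf.write("hello.wav", audio, sr)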
def save_audio(audio, sample_rate):
    # Save the waveform to a temporary WAV file and return its path.
    # delete=False keeps the file on disk so st.audio can read it later.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as f:
        sf.write(f.name, audio, sample_rate)
    return f.name
def extract_text_from_image(image_array, languages):
    reader = load_ocr_reader(languages)
    results = reader.readtext(image_array)
    # Concatenate the detected text fragments into a single string.
    extracted_text = " ".join([res[1] for res in results])
    return extracted_text
# ---------------------------
# Mapping for EasyOCR language codes (EasyOCR uses 'ch_sim' for Chinese simplified)
# ---------------------------
ocr_language_map = {
    'en': 'en',
    'es': 'es',
    'ch': 'ch_sim',
    'ar': 'ar'
}
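# EasyOCR readers can also combine languages, e.g. easyocr.Reader(['en', 'es']);
# note that 'ch_sim' can only be paired with 'en', per the EasyOCR docs.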
# ---------------------------
# Streamlit App UI
# ---------------------------
st.title("Image-to-Audio Description App")
st.write("Upload an image or enter text to generate audio descriptions.")
# Select language for both OCR and TTS
language = st.selectbox("Select language", options=['en', 'es', 'ch', 'ar'], index=0)
# Choose input method
input_method = st.radio("Input method", options=["Upload Image", "Enter Text"])
text = ""
if input_method == "Upload Image":
    uploaded_file = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        image = Image.open(uploaded_file)
        st.image(image, caption='Uploaded Image', use_column_width=True)
        # Convert the PIL image to an RGB NumPy array for EasyOCR (PNG uploads
        # may carry an alpha channel, which would give a 4-channel array).
        image_array = np.array(image.convert("RGB"))
        with st.spinner("Extracting text from image..."):
            # Wrap the selected language in the code EasyOCR expects.
            ocr_lang = [ocr_language_map.get(language, 'en')]
            text = extract_text_from_image(image_array, ocr_lang)
        st.write("**Extracted Text:**")
        st.write(text)
else:
    text = st.text_area("Enter text to synthesize", "Type your description here...")
if text and st.button("Generate Speech"):
    with st.spinner("Synthesizing speech..."):
        audio, sr = synthesize_speech(text, language)
        audio_file = save_audio(audio, sr)
    st.success("Audio generated!")
    st.audio(audio_file)
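    # Optional follow-up, a minimal sketch: offer the generated WAV as a
    # download via st.download_button (the file name here is arbitrary).
    with open(audio_file, "rb") as wav_file:
        st.download_button("Download audio", wav_file.read(),
                           file_name="speech.wav", mime="audio/wav")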