import streamlit as st
from PIL import Image
import numpy as np
import tempfile
import soundfile as sf
import torch
import easyocr
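# Dependencies (a plausible install line, not pinned by the original source):
#   pip install streamlit easyocr torch soundfile pillow numpy omegaconf
# omegaconf is listed because Silero's torch.hub entry point uses it to parse
# its model registry.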
# ---------------------------
# Caching the OCR reader for performance
# ---------------------------
@st.cache_resource
def load_ocr_reader(languages):
    # EasyOCR expects language codes like "en", "es", "ch_sim", "ar".
    # `languages` is passed as a tuple so Streamlit can hash it for caching.
    return easyocr.Reader(list(languages), gpu=False)
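# Quick sanity check outside Streamlit (hypothetical file name):
#   reader = load_ocr_reader(('en',))
#   print(reader.readtext('sample.png', detail=0))  # detail=0 returns plain strings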
# ---------------------------
# Caching TTS model loading (Silero TTS)
# ---------------------------
@st.cache_resource
def load_tts_model(language):
    # Map our language codes to a Silero model language, model id and default voice.
    # Silero ships official 'en' and 'es' models; for languages without one here
    # ('ch', 'ar') we fall back to English.
    lang_model_map = {
        'en': ('en', 'v3_en', 'en_0'),
        'es': ('es', 'v3_es', 'es_0'),
        'ch': ('en', 'v3_en', 'en_0'),  # fallback to English for now
        'ar': ('en', 'v3_en', 'en_0')   # fallback to English for now
    }
    model_language, model_id, speaker = lang_model_map.get(language, ('en', 'v3_en', 'en_0'))
    device = torch.device('cpu')
    # Load the Silero TTS model from torch.hub; the weights are downloaded the
    # first time this runs. The hub entry point returns (model, example_text).
    model, _example_text = torch.hub.load(
        repo_or_dir='snakers4/silero-models',
        model='silero_tts',
        language=model_language,
        speaker=model_id
    )
    model.to(device)
    sample_rate = 48000  # Silero v3 models support 8000, 24000 and 48000 Hz
    return model, sample_rate, speaker
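# Because the loader is wrapped in st.cache_resource, the hub download and
# model construction happen once per Streamlit process rather than on every rerun.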
def synthesize_speech(text, language):
    model, sample_rate, speaker = load_tts_model(language)
    # apply_tts returns a 1-D torch tensor; convert it to a NumPy array so
    # soundfile can write it directly.
    audio = model.apply_tts(text=text, speaker=speaker, sample_rate=sample_rate)
    return audio.numpy(), sample_rate
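# Example (hypothetical input): audio, sr = synthesize_speech("Hello world", 'en')
# yields a float32 waveform at 48 kHz, ready for sf.write() or st.audio().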
def save_audio(audio, sample_rate):
    # Write the waveform to a temporary WAV file and return its path.
    # delete=False keeps the file on disk so st.audio can read it later.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as f:
        sf.write(f.name, audio, sample_rate)
    return f.name
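# Note: nothing deletes these temp files; on a long-running server you may want
# to os.unlink() the path after st.audio has rendered, or synthesize to an
# in-memory buffer instead (a design choice, not from the original).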
def extract_text_from_image(image_array, languages):
    reader = load_ocr_reader(languages)
    # Each result is a (bounding_box, text, confidence) triple.
    results = reader.readtext(image_array)
    # Concatenate the detected text fragments.
    extracted_text = " ".join([res[1] for res in results])
    return extracted_text
# ---------------------------
# Mapping to EasyOCR language codes (EasyOCR uses 'ch_sim' for Simplified Chinese)
# ---------------------------
ocr_language_map = {
    'en': 'en',
    'es': 'es',
    'ch': 'ch_sim',
    'ar': 'ar'
}
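# Note on EasyOCR behavior: a Reader can combine several languages, but
# 'ch_sim' only pairs with English; passing a single language at a time,
# as this app does, side-steps that constraint.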
# ---------------------------
# Streamlit App UI
# ---------------------------
st.title("Image-to-Audio Description App")
st.write("Upload an image or enter text to generate audio descriptions.")
# Select the language used for both OCR and TTS
language = st.selectbox("Select language", options=['en', 'es', 'ch', 'ar'], index=0)
# Choose the input method
input_method = st.radio("Input method", options=["Upload Image", "Enter Text"])
text = "" | |
if input_method == "Upload Image": | |
uploaded_file = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"]) | |
if uploaded_file is not None: | |
image = Image.open(uploaded_file) | |
st.image(image, caption='Uploaded Image', use_column_width=True) | |
# Convert PIL image to numpy array for EasyOCR | |
image_array = np.array(image) | |
with st.spinner("Extracting text from image..."): | |
# EasyOCR expects language codes; here we wrap our choice. | |
ocr_lang = [ocr_language_map.get(language, 'en')] | |
text = extract_text_from_image(image_array, ocr_lang) | |
st.write("**Extracted Text:**") | |
st.write(text) | |
else: | |
text = st.text_area("Enter text to synthesize", "Type your description here...") | |
if text and st.button("Generate Speech"):
    with st.spinner("Synthesizing speech..."):
        audio, sr = synthesize_speech(text, language)
        audio_file = save_audio(audio, sr)
    st.success("Audio generated!")
    st.audio(audio_file)
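    # Optional extra (a sketch, not part of the original app): offer the WAV as
    # a download. st.download_button is part of Streamlit's stable API.
    with open(audio_file, 'rb') as wav:
        st.download_button("Download audio", wav.read(),
                           file_name="description.wav", mime="audio/wav")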