Spaces:

freddyaboulton
/

really-fast-whisper

Running on CPU Upgrade

App Files Files Community

freddyaboulton HF Staff committed 11 days ago

Commit

bd507ca

verified ·

1 Parent(s): 0a71e28

Upload 2 files

Browse files

Files changed (2) hide show

app.py +10 -9
languages.py +102 -0

app.py CHANGED Viewed

@@ -14,6 +14,7 @@ from fastrtc import (
     get_turn_credentials,
 )
 from gradio.utils import get_space
 cur_dir = Path(__file__).parent
@@ -23,39 +24,39 @@ load_dotenv()
 client = AsyncClient(timeout=30)
-async def transcribe_file(audio: tuple[int, np.ndarray]):
     response = await client.post(
         url="https://douatiz8x2itm3yn.us-east-1.aws.endpoints.huggingface.cloud/api/v1/audio/transcriptions",
         headers={"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"},
         files={"file": audio_to_bytes(audio)},
-        data={"response_format": "text"},
     )
     return response.text
-async def transcribe(audio: tuple[int, np.ndarray], transcript: str):
-    text = await transcribe_file(audio)
     yield AdditionalOutputs(transcript + " " + text)
 transcript = gr.Textbox(label="Transcript")
 stream = Stream(
-    ReplyOnPause(transcribe),
     modality="audio",
     mode="send",
-    additional_inputs=[transcript],
     additional_outputs=[transcript],
     additional_outputs_handler=lambda a, b: b,
-    rtc_configuration=get_turn_credentials_async if get_space() else None,
     server_rtc_configuration=get_turn_credentials(ttl=604_800),
     concurrency_limit=20 if get_space() else None,
-    time_limit=300,
     ui_args={"title": ""},
 )
 iface = gr.Interface(
     fn=transcribe_file,
-    inputs=[gr.Audio(label="Upload Audio", sources=["upload", "microphone"])],
     outputs=gr.Textbox(label="Transcript"),
 )

     get_turn_credentials,
 )
 from gradio.utils import get_space
+from languages import LANGUAGES
 cur_dir = Path(__file__).parent
 client = AsyncClient(timeout=30)
+async def transcribe_file(audio: tuple[int, np.ndarray], language: str):
     response = await client.post(
         url="https://douatiz8x2itm3yn.us-east-1.aws.endpoints.huggingface.cloud/api/v1/audio/transcriptions",
         headers={"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"},
         files={"file": audio_to_bytes(audio)},
+        data={"response_format": "text", "language": language},
     )
     return response.text
+async def transcribe(audio: tuple[int, np.ndarray], transcript: str, language: str):
+    text = await transcribe_file(audio, language)
     yield AdditionalOutputs(transcript + " " + text)
 transcript = gr.Textbox(label="Transcript")
 stream = Stream(
+    ReplyOnPause(transcribe, input_sample_rate=48_100),
     modality="audio",
     mode="send",
+    additional_inputs=[transcript, gr.Dropdown(choices=LANGUAGES, label="Language")],
     additional_outputs=[transcript],
     additional_outputs_handler=lambda a, b: b,
+    rtc_configuration=get_turn_credentials_async,
     server_rtc_configuration=get_turn_credentials(ttl=604_800),
     concurrency_limit=20 if get_space() else None,
+    time_limit=300,
     ui_args={"title": ""},
 )
 iface = gr.Interface(
     fn=transcribe_file,
+    inputs=[gr.Audio(label="Upload Audio", sources=["upload", "microphone"]), gr.Dropdown(choices=LANGUAGES, label="Language")],
     outputs=gr.Textbox(label="Transcript"),
 )

languages.py ADDED Viewed

	@@ -0,0 +1,102 @@

+LANGUAGES = [
+    ("English", "<|en|>"),
+    ("Chinese", "<|zh|>"),
+    ("German", "<|de|>"),
+    ("Spanish", "<|es|>"),
+    ("Russian", "<|ru|>"),
+    ("Korean", "<|ko|>"),
+    ("French", "<|fr|>"),
+    ("Japanese", "<|ja|>"),
+    ("Portuguese", "<|pt|>"),
+    ("Turkish", "<|tr|>"),
+    ("Polish", "<|pl|>"),
+    ("Catalan", "<|ca|>"),
+    ("Dutch", "<|nl|>"),
+    ("Arabic", "<|ar|>"),
+    ("Swedish", "<|sv|>"),
+    ("Italian", "<|it|>"),
+    ("Indonesian", "<|id|>"),
+    ("Hindi", "<|hi|>"),
+    ("Finnish", "<|fi|>"),
+    ("Vietnamese", "<|vi|>"),
+    ("Hebrew", "<|he|>"),
+    ("Ukrainian", "<|uk|>"),
+    ("Greek", "<|el|>"),
+    ("Malay", "<|ms|>"),
+    ("Czech", "<|cs|>"),
+    ("Romanian", "<|ro|>"),
+    ("Danish", "<|da|>"),
+    ("Hungarian", "<|hu|>"),
+    ("Tamil", "<|ta|>"),
+    ("Norwegian", "<|no|>"),
+    ("Thai", "<|th|>"),
+    ("Urdu", "<|ur|>"),
+    ("Croatian", "<|hr|>"),
+    ("Bulgarian", "<|bg|>"),
+    ("Lithuanian", "<|lt|>"),
+    ("Latin", "<|la|>"),
+    ("Maori", "<|mi|>"),
+    ("Malayalam", "<|ml|>"),
+    ("Welsh", "<|cy|>"),
+    ("Slovak", "<|sk|>"),
+    ("Telugu", "<|te|>"),
+    ("Persian", "<|fa|>"),
+    ("Latvian", "<|lv|>"),
+    ("Bengali", "<|bn|>"),
+    ("Serbian", "<|sr|>"),
+    ("Azerbaijani", "<|az|>"),
+    ("Slovenian", "<|sl|>"),
+    ("Kannada", "<|kn|>"),
+    ("Estonian", "<|et|>"),
+    ("Macedonian", "<|mk|>"),
+    ("Breton", "<|br|>"),
+    ("Basque", "<|eu|>"),
+    ("Icelandic", "<|is|>"),
+    ("Armenian", "<|hy|>"),
+    ("Nepali", "<|ne|>"),
+    ("Mongolian", "<|mn|>"),
+    ("Bosnian", "<|bs|>"),
+    ("Kazakh", "<|kk|>"),
+    ("Albanian", "<|sq|>"),
+    ("Swahili", "<|sw|>"),
+    ("Galician", "<|gl|>"),
+    ("Marathi", "<|mr|>"),
+    ("Punjabi", "<|pa|>"),
+    ("Sinhala", "<|si|>"),
+    ("Khmer", "<|km|>"),
+    ("Shona", "<|sn|>"),
+    ("Yoruba", "<|yo|>"),
+    ("Somali", "<|so|>"),
+    ("Afrikaans", "<|af|>"),
+    ("Occitan", "<|oc|>"),
+    ("Georgian", "<|ka|>"),
+    ("Belarusian", "<|be|>"),
+    ("Tajik", "<|tg|>"),
+    ("Sindhi", "<|sd|>"),
+    ("Gujarati", "<|gu|>"),
+    ("Amharic", "<|am|>"),
+    ("Yiddish", "<|yi|>"),
+    ("Lao", "<|lo|>"),
+    ("Uzbek", "<|uz|>"),
+    ("Faroese", "<|fo|>"),
+    ("Haitian Creole", "<|ht|>"),
+    ("Pashto", "<|ps|>"),
+    ("Turkmen", "<|tk|>"),
+    ("Norwegian Nynorsk", "<|nn|>"),
+    ("Maltese", "<|mt|>"),
+    ("Sanskrit", "<|sa|>"),
+    ("Luxembourgish", "<|lb|>"),
+    ("Burmese", "<|my|>"),
+    ("Tibetan", "<|bo|>"),
+    ("Tagalog", "<|tl|>"),
+    ("Malagasy", "<|mg|>"),
+    ("Assamese", "<|as|>"),
+    ("Tatar", "<|tt|>"),
+    ("Hawaiian", "<|haw|>"),
+    ("Lingala", "<|ln|>"),
+    ("Hausa", "<|ha|>"),
+    ("Bashkir", "<|ba|>"),
+    ("Javanese", "<|jw|>"),
+    ("Sundanese", "<|su|>"),
+    ("Cantonese", "<|yue|>"),
+]