File size: 2,558 Bytes
7be7ac1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f28d146
 
 
 
 
 
 
 
7be7ac1
 
 
 
 
f28d146
 
7be7ac1
f28d146
7be7ac1
 
 
 
f28d146
7be7ac1
 
 
f28d146
7be7ac1
f28d146
7be7ac1
 
 
 
 
f28d146
 
7be7ac1
 
f28d146
 
7be7ac1
f28d146
7be7ac1
 
 
f28d146
7be7ac1
 
 
f28d146
7be7ac1
 
 
 
 
f28d146
7be7ac1
 
 
23cb5b2
 
 
f28d146
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import asyncio
import websockets
import streamlit as st
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import numpy as np
import torch
import soundfile as sf
import io

# Load pre-trained model and tokenizer
# Downloads (or reads from the local HF cache) the 960h LibriSpeech-finetuned
# base wav2vec2 checkpoint at import time, so server startup includes this cost.
# NOTE(review): Wav2Vec2Tokenizer is deprecated upstream in favour of
# Wav2Vec2Processor — confirm against the pinned transformers version before upgrading.
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

async def recognize_speech(websocket):
    """Transcribe each binary audio message on *websocket* and send the text back.

    For every incoming message: decode it as an audio container, run the
    waveform through the Wav2Vec2 CTC model, greedy-decode the logits, and
    reply with the transcription on the same socket.

    Parameters:
        websocket: an open websockets connection yielding binary audio chunks.
    """
    async for message in websocket:
        try:
            wf, samplerate = sf.read(io.BytesIO(message))
        except (RuntimeError, sf.LibsndfileError):
            # An undecodable chunk previously raised out of the async-for and
            # tore down the whole connection; skip it and keep serving.
            # NOTE(review): the browser sends 'audio/webm' MediaRecorder chunks;
            # confirm libsndfile can actually decode that container.
            continue

        # sf.read returns (frames, channels) for multi-channel audio, but the
        # tokenizer expects a 1-D waveform — collapse to mono.
        if wf.ndim > 1:
            wf = wf.mean(axis=1)

        # NOTE(review): wav2vec2-base-960h expects 16 kHz input; `samplerate`
        # is read but never checked — confirm the capture rate or resample.
        input_values = tokenizer(wf, return_tensors="pt").input_values
        with torch.no_grad():
            logits = model(input_values).logits

        # Greedy CTC decode: most likely token per frame, then collapse.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = tokenizer.decode(predicted_ids[0])
        await websocket.send(transcription)

async def main_logic():
    """Run the ASR WebSocket server on ws://localhost:8000 until cancelled."""
    async with websockets.serve(recognize_speech, "localhost", 8000):
        # Await a future that is never resolved so the server stays up for
        # the lifetime of the event loop (until the task is cancelled).
        never_done = asyncio.get_running_loop().create_future()
        await never_done

# Create the streamlit interface
st.title("Real-Time ASR with Transformers.js")

# BUG FIX: st.markdown(..., unsafe_allow_html=True) sanitizes <script> tags,
# so the microphone/WebSocket JavaScript below was never executed.  Render it
# through streamlit.components.v1.html instead, which runs the markup inside
# an iframe where scripts do execute.
import streamlit.components.v1 as components

# The script can't be run via "streamlit run" because that hangs asyncio loop
components.html("""
<script>
    const handleAudio = async (stream) => {
        const websocket = new WebSocket('ws://localhost:8000');
        const mediaRecorder = new MediaRecorder(stream, {mimeType: 'audio/webm'});
        const audioChunks = [];

        mediaRecorder.addEventListener("dataavailable", event => {
            console.log('dataavailable:', event.data);
            audioChunks.push(event.data);
            websocket.send(event.data);
        });

        websocket.onmessage = (event) => {
            const transcription = event.data;
            const transcriptionDiv = document.getElementById("transcription");
            transcriptionDiv.innerHTML = transcriptionDiv.innerHTML + transcription + "<br/>";
            console.log('Received:', transcription);
        };

        mediaRecorder.start(1000);

        websocket.onopen = () => {
            console.log('Connected to WebSocket');
        };

        websocket.onerror = (error) => {
            console.error('WebSocket Error:', error);
        };

        websocket.onclose = () => {
            console.log('WebSocket Closed');
        };
    };

    navigator.mediaDevices.getUserMedia({ audio: true })
        .then(handleAudio)
        .catch(error => console.error('getUserMedia Error:', error));
</script>

<div id="transcription">Your transcriptions will appear here:</div>
""", height=300)

if __name__ == "__main__":
    # Start the WebSocket ASR server; blocks until the event loop is stopped.
    # NOTE(review): under `streamlit run` this blocking call would stall the
    # Streamlit script runner — confirm the intended launch mode (plain
    # `python` for the server vs. streamlit for the UI).
    asyncio.run(main_logic())