freddyaboulton HF Staff committed on
Commit
2ec08a6
·
verified ·
1 Parent(s): 3c84e3a

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. app.py +24 -17
  3. requirements.txt +0 -1
README.md CHANGED
@@ -9,7 +9,7 @@ app_file: app.py
9
  pinned: false
10
  license: mit
11
  short_description: Gemini understands audio and video!
12
- tags: [webrtc, websocket, gradio, secret|TWILIO_ACCOUNT_SID, secret|TWILIO_AUTH_TOKEN, secret|GEMINI_API_KEY]
13
  ---
14
 
15
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
9
  pinned: false
10
  license: mit
11
  short_description: Gemini understands audio and video!
12
+ tags: [webrtc, websocket, gradio, secret|HF_TOKEN, secret|GEMINI_API_KEY]
13
  ---
14
 
15
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -5,16 +5,18 @@ import time
5
  from io import BytesIO
6
 
7
  import gradio as gr
8
- from gradio.utils import get_space
9
  import numpy as np
10
- from google import genai
11
  from dotenv import load_dotenv
12
  from fastrtc import (
13
  AsyncAudioVideoStreamHandler,
14
  Stream,
15
- get_twilio_turn_credentials,
16
  WebRTC,
 
 
17
  )
 
 
18
  from PIL import Image
19
 
20
  load_dotenv()
@@ -44,12 +46,10 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
44
  super().__init__(
45
  "mono",
46
  output_sample_rate=24000,
47
- output_frame_size=480,
48
  input_sample_rate=16000,
49
  )
50
  self.audio_queue = asyncio.Queue()
51
  self.video_queue = asyncio.Queue()
52
- self.quit = asyncio.Event()
53
  self.session = None
54
  self.last_frame_time = 0
55
  self.quit = asyncio.Event()
@@ -69,10 +69,14 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
69
  print("set session")
70
  while not self.quit.is_set():
71
  turn = self.session.receive()
72
- async for response in turn:
73
- if data := response.data:
74
- audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
 
75
  self.audio_queue.put_nowait(audio)
 
 
 
76
 
77
  async def video_receive(self, frame: np.ndarray):
78
  if self.session:
@@ -87,7 +91,11 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
87
  self.video_queue.put_nowait(frame)
88
 
89
  async def video_emit(self):
90
- return await self.video_queue.get()
 
 
 
 
91
 
92
  async def receive(self, frame: tuple[int, np.ndarray]) -> None:
93
  _, array = frame
@@ -97,13 +105,14 @@ class GeminiHandler(AsyncAudioVideoStreamHandler):
97
  await self.session.send(input=audio_message)
98
 
99
  async def emit(self):
100
- array = await self.audio_queue.get()
101
- return (self.output_sample_rate, array)
 
102
 
103
  async def shutdown(self) -> None:
104
  if self.session:
105
  self.quit.set()
106
- await self.session._websocket.close()
107
  self.quit.clear()
108
 
109
 
@@ -111,9 +120,7 @@ stream = Stream(
111
  handler=GeminiHandler(),
112
  modality="audio-video",
113
  mode="send-receive",
114
- rtc_configuration=get_twilio_turn_credentials()
115
- if get_space()
116
- else None,
117
  time_limit=90 if get_space() else None,
118
  additional_inputs=[
119
  gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
@@ -153,8 +160,8 @@ with gr.Blocks(css=css) as demo:
153
  modality="audio-video",
154
  mode="send-receive",
155
  elem_id="video-source",
156
- rtc_configuration=get_twilio_turn_credentials()
157
- if get_space()
158
  else None,
159
  icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
160
  pulse_color="rgb(255, 255, 255)",
 
5
  from io import BytesIO
6
 
7
  import gradio as gr
 
8
  import numpy as np
9
+ import websockets
10
  from dotenv import load_dotenv
11
  from fastrtc import (
12
  AsyncAudioVideoStreamHandler,
13
  Stream,
 
14
  WebRTC,
15
+ get_turn_credentials_async,
16
+ wait_for_item,
17
  )
18
+ from google import genai
19
+ from gradio.utils import get_space
20
  from PIL import Image
21
 
22
  load_dotenv()
 
46
  super().__init__(
47
  "mono",
48
  output_sample_rate=24000,
 
49
  input_sample_rate=16000,
50
  )
51
  self.audio_queue = asyncio.Queue()
52
  self.video_queue = asyncio.Queue()
 
53
  self.session = None
54
  self.last_frame_time = 0
55
  self.quit = asyncio.Event()
 
69
  print("set session")
70
  while not self.quit.is_set():
71
  turn = self.session.receive()
72
+ try:
73
+ async for response in turn:
74
+ if data := response.data:
75
+ audio = np.frombuffer(data, dtype=np.int16).reshape(1, -1)
76
  self.audio_queue.put_nowait(audio)
77
+ except websockets.exceptions.ConnectionClosedOK:
78
+ print("connection closed")
79
+ break
80
 
81
  async def video_receive(self, frame: np.ndarray):
82
  if self.session:
 
91
  self.video_queue.put_nowait(frame)
92
 
93
  async def video_emit(self):
94
+ frame = await wait_for_item(self.video_queue)
95
+ if frame is not None:
96
+ return frame
97
+ else:
98
+ return np.zeros((100, 100, 3), dtype=np.uint8)
99
 
100
  async def receive(self, frame: tuple[int, np.ndarray]) -> None:
101
  _, array = frame
 
105
  await self.session.send(input=audio_message)
106
 
107
  async def emit(self):
108
+ array = await wait_for_item(self.audio_queue)
109
+ if array is not None:
110
+ return (self.output_sample_rate, array)
111
 
112
  async def shutdown(self) -> None:
113
  if self.session:
114
  self.quit.set()
115
+ await self.session.close()
116
  self.quit.clear()
117
 
118
 
 
120
  handler=GeminiHandler(),
121
  modality="audio-video",
122
  mode="send-receive",
123
+ rtc_configuration=get_turn_credentials_async if get_space() == "spaces" else None,
 
 
124
  time_limit=90 if get_space() else None,
125
  additional_inputs=[
126
  gr.Image(label="Image", type="numpy", sources=["upload", "clipboard"])
 
160
  modality="audio-video",
161
  mode="send-receive",
162
  elem_id="video-source",
163
+ rtc_configuration=get_turn_credentials_async
164
+ if get_space() == "spaces"
165
  else None,
166
  icon="https://www.gstatic.com/lamda/images/gemini_favicon_f069958c85030456e93de685481c559f160ea06b.png",
167
  pulse_color="rgb(255, 255, 255)",
requirements.txt CHANGED
@@ -2,4 +2,3 @@ fastrtc
2
  python-dotenv
3
  google-genai
4
  twilio
5
- pydantic==2.10.0
 
2
  python-dotenv
3
  google-genai
4
  twilio