Update app.py
app.py CHANGED
@@ -7,10 +7,10 @@ import time
 import re
 
 from openai import OpenAI
+from dotenv import load_dotenv
 from realtime_transcriber import WebSocketClient, connections, WEBSOCKET_URI, WEBSOCKET_HEADERS
 
 # ------------------ Load API Key ------------------
-from dotenv import load_dotenv
 load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 ASSISTANT_ID = os.getenv("ASSISTANT_ID")
@@ -56,14 +56,20 @@ def process_chat(message, history, session_id):
         time.sleep(1)
 
     messages = client.beta.threads.messages.list(thread_id=thread_id)
+    assistant_response = "⚠️ Assistant did not respond."
     for msg in reversed(messages.data):
         if msg.role == "assistant":
             assistant_response = msg.content[0].text.value
             break
-    else:
-        assistant_response = "⚠️ Assistant did not respond."
 
-    return assistant_response
+    return assistant_response
+
+def extract_image_url(text):
+    match = re.search(
+        r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
+        text
+    )
+    return match.group(0) if match else None
 
 # ------------------ Transcription Logic ------------------
 def create_websocket_client():
@@ -84,33 +90,49 @@ def send_audio_chunk(audio, client_id):
     connections[client_id].enqueue_audio_chunk(sr, y)
     return connections[client_id].transcript
 
-# ------------------ Gradio
+# ------------------ Gradio App ------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🧠 Document AI + 🎙️ Voice Assistant")
 
     session_id = gr.State(value=reset_session())
    client_id = gr.State()
+    image_url = gr.State(value=None)
 
-    # ---------- Section 1: Chat Interface ----------
     with gr.Row():
-        …
+        with gr.Column(scale=1):
+            image_display = gr.Image(label="📄 Extracted Document Image", show_label=True, height=400)
+        with gr.Column(scale=2):
+            chatbot = gr.ChatInterface(
+                fn=lambda message, history, session_id: process_chat(message, history, session_id),
+                additional_inputs=[session_id],
+                examples=[
+                    ["What does clause 3.2 mean?"],
+                    ["Summarize the timeline from the image."]
+                ],
+                title="💬 Document Assistant"
+            )
+
+    # Inject logic to extract image when assistant replies
+    def handle_reply_and_update_image(message, history, session_id):
+        response = process_chat(message, history, session_id)
+        url = extract_image_url(response)
+        return response, url
+
+    chatbot.fn = lambda message, history, session_id: handle_reply_and_update_image(message, history, session_id)[0]
+    chatbot.chatbot.change(
+        fn=lambda m, h, s: handle_reply_and_update_image(m, h, s)[1],
+        inputs=[chatbot.input, chatbot.chatbot, session_id],
+        outputs=image_display
+    )
 
-    #
+    # ------------------ Voice Transcription ------------------
     gr.Markdown("## 🎙️ Realtime Voice Transcription")
 
     with gr.Row():
         transcript_box = gr.Textbox(label="Live Transcript", lines=7, interactive=False, autoscroll=True)
-
+
     with gr.Row():
-        mic_input = gr.Audio(streaming=True)
+        mic_input = gr.Audio(streaming=True)
         clear_button = gr.Button("Clear Transcript")
 
     mic_input.stream(fn=send_audio_chunk, inputs=[mic_input, client_id], outputs=transcript_box)
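For readers following the change, below is a small, self-contained sketch of how the new extract_image_url helper behaves. The helper body is copied from the diff above; the sample reply text and the timeline.png filename are made-up values used only for illustration.

import re

# Helper introduced in this commit: return the first PNG link under the
# surgical-pathology-manual raw GitHub path found in the text, else None.
def extract_image_url(text):
    match = re.search(
        r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
        text
    )
    return match.group(0) if match else None

# Hypothetical assistant reply, used only to illustrate the behaviour.
reply = (
    "The timeline is shown on the scanned page: "
    "https://raw.githubusercontent.com/AndrewLORTech/surgical-pathology-manual/main/timeline.png"
)

print(extract_image_url(reply))   # prints the full .png URL
print(extract_image_url("No image referenced in this reply."))  # prints None

In the app itself, handle_reply_and_update_image passes this URL to the image_display component whenever the assistant replies, which is what the chatbot.chatbot.change wiring in the diff hooks up.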