IAMTFRMZA committed
Commit b74ae51 · verified · 1 Parent(s): 95695d7

Update app.py

Files changed (1)
  1. app.py +40 -18
app.py CHANGED
@@ -7,10 +7,10 @@ import time
 import re
 
 from openai import OpenAI
+from dotenv import load_dotenv
 from realtime_transcriber import WebSocketClient, connections, WEBSOCKET_URI, WEBSOCKET_HEADERS
 
 # ------------------ Load API Key ------------------
-from dotenv import load_dotenv
 load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 ASSISTANT_ID = os.getenv("ASSISTANT_ID")
@@ -56,14 +56,20 @@ def process_chat(message, history, session_id):
         time.sleep(1)
 
     messages = client.beta.threads.messages.list(thread_id=thread_id)
+    assistant_response = "⚠️ Assistant did not respond."
     for msg in reversed(messages.data):
         if msg.role == "assistant":
            assistant_response = msg.content[0].text.value
            break
-    else:
-        assistant_response = "⚠️ Assistant did not respond."
 
-    return assistant_response  # ✅ only returning text now
+    return assistant_response
+
+def extract_image_url(text):
+    match = re.search(
+        r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
+        text
+    )
+    return match.group(0) if match else None
 
 # ------------------ Transcription Logic ------------------
 def create_websocket_client():
@@ -84,33 +90,49 @@ def send_audio_chunk(audio, client_id):
     connections[client_id].enqueue_audio_chunk(sr, y)
     return connections[client_id].transcript
 
-# ------------------ Gradio Interface ------------------
+# ------------------ Gradio App ------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🧠 Document AI + 🎙️ Voice Assistant")
 
     session_id = gr.State(value=reset_session())
     client_id = gr.State()
+    image_url = gr.State(value=None)
 
-    # ---------- Section 1: Chat Interface ----------
     with gr.Row():
-        chatbot = gr.ChatInterface(
-            fn=lambda message, history, session_id: process_chat(message, history, session_id),
-            additional_inputs=[session_id],
-            examples=[
-                ["What does clause 3.2 mean?"],
-                ["Summarize the timeline from the image."]
-            ],
-            title="💬 Document Assistant"
-        )
+        with gr.Column(scale=1):
+            image_display = gr.Image(label="📑 Extracted Document Image", show_label=True, height=400)
+        with gr.Column(scale=2):
+            chatbot = gr.ChatInterface(
+                fn=lambda message, history, session_id: process_chat(message, history, session_id),
+                additional_inputs=[session_id],
+                examples=[
+                    ["What does clause 3.2 mean?"],
+                    ["Summarize the timeline from the image."]
+                ],
+                title="💬 Document Assistant"
+            )
+
+    # Inject logic to extract image when assistant replies
+    def handle_reply_and_update_image(message, history, session_id):
+        response = process_chat(message, history, session_id)
+        url = extract_image_url(response)
+        return response, url
+
+    chatbot.fn = lambda message, history, session_id: handle_reply_and_update_image(message, history, session_id)[0]
+    chatbot.chatbot.change(
+        fn=lambda m, h, s: handle_reply_and_update_image(m, h, s)[1],
+        inputs=[chatbot.input, chatbot.chatbot, session_id],
+        outputs=image_display
+    )
 
-    # ---------- Section 2: Voice Transcription ----------
+    # ------------------ Voice Transcription ------------------
     gr.Markdown("## 🎙️ Realtime Voice Transcription")
 
     with gr.Row():
         transcript_box = gr.Textbox(label="Live Transcript", lines=7, interactive=False, autoscroll=True)
-
+
     with gr.Row():
-        mic_input = gr.Audio(streaming=True)  # ✅ fixed for Hugging Face compatibility
+        mic_input = gr.Audio(streaming=True)
         clear_button = gr.Button("Clear Transcript")
 
     mic_input.stream(fn=send_audio_chunk, inputs=[mic_input, client_id], outputs=transcript_box)
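
For reference, a minimal, self-contained sketch of how the new extract_image_url helper is expected to behave. The function body is copied from the diff above; the sample reply text and the figures/page-12.png path are invented for illustration only.

import re

def extract_image_url(text):
    # Same pattern as in app.py: first raw.githubusercontent.com PNG link, if any
    match = re.search(
        r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
        text
    )
    return match.group(0) if match else None

# Hypothetical assistant reply containing one matching PNG link
sample_reply = (
    "See the annotated figure: "
    "https://raw.githubusercontent.com/AndrewLORTech/surgical-pathology-manual/main/figures/page-12.png"
)

print(extract_image_url(sample_reply))     # prints the PNG URL
print(extract_image_url("no image here"))  # prints None

In app.py, handle_reply_and_update_image passes the extracted URL to image_display via chatbot.chatbot.change.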