Update app.py
app.py CHANGED
@@ -7,7 +7,7 @@ from openai import OpenAI
 from websockets import connect, Data, ClientConnection
 from dotenv import load_dotenv
 
-#
+# ============ Load Secrets ============
 load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 ASSISTANT_ID = os.getenv("ASSISTANT_ID")
@@ -17,7 +17,7 @@ HEADERS = {"Authorization": f"Bearer {OPENAI_API_KEY}", "OpenAI-Beta": "realtime
 WS_URI = "wss://api.openai.com/v1/realtime?intent=transcription"
 connections = {}
 
-#
+# ============ WebSocket Client ============
 class WebSocketClient:
     def __init__(self, uri, headers, client_id):
         self.uri, self.headers, self.client_id = uri, headers, client_id
@@ -45,7 +45,10 @@ class WebSocketClient:
         buf = io.BytesIO(); sf.write(buf, int16, sr, format='WAV', subtype='PCM_16')
         audio = AudioSegment.from_file(buf, format="wav").set_frame_rate(24000)
         out = io.BytesIO(); audio.export(out, format="wav"); out.seek(0)
-        await self.websocket.send(json.dumps({
+        await self.websocket.send(json.dumps({
+            "type": "input_audio_buffer.append",
+            "audio": base64.b64encode(out.read()).decode()
+        }))
 
     async def receive_messages(self):
         async for msg in self.websocket:
@@ -74,7 +77,7 @@ def clear_transcript(cid):
     if cid in connections: connections[cid].transcript = ""
     return ""
 
-#
+# ============ Chat Assistant ============
 def handle_chat(user_input, history, thread_id, image_url):
     if not OPENAI_API_KEY or not ASSISTANT_ID:
         return "❌ Missing secrets!", history, thread_id, image_url
@@ -97,7 +100,10 @@ def handle_chat(user_input, history, thread_id, image_url):
             if msg.role == "assistant":
                 content = msg.content[0].text.value
                 history.append((user_input, content))
-                match = re.search(
+                match = re.search(
+                    r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
+                    content
+                )
                 if match: image_url = match.group(0)
                 break
@@ -106,7 +112,7 @@ def handle_chat(user_input, history, thread_id, image_url):
     except Exception as e:
         return f"❌ {e}", history, thread_id, image_url
 
-#
+# ============ Gradio UI ============
 with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown("# 📄 Document AI Assistant")
 
@@ -115,31 +121,37 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
     thread_state = gr.State()
     image_state = gr.State()
     client_id = gr.State()
+    voice_enabled = gr.State(False)
 
-    with gr.Row():
+    with gr.Row(equal_height=True):
         with gr.Column(scale=1):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            image_display = gr.Image(label="🖼️ Document", type="filepath", show_download_button=False)
+
+        with gr.Column(scale=1.4):
+            chat = gr.Chatbot(label="💬 Chat", height=460)
+
+            with gr.Row():
+                user_prompt = gr.Textbox(placeholder="Ask your question...", show_label=False, scale=6)
+                mic_toggle_btn = gr.Button("🎙️", scale=1)
+                send_btn = gr.Button("Send", variant="primary", scale=2)
+
+    with gr.Accordion("🎤 Voice Transcription", open=False) as voice_section:
+        with gr.Row():
+            voice_input = gr.Audio(label="Mic", streaming=True)
+            voice_transcript = gr.Textbox(label="Transcript", lines=2, interactive=False)
+        clear_btn = gr.Button("🧹 Clear Transcript")
+
+    # FUNCTIONAL CONNECTIONS
+    def toggle_voice(curr):
+        return not curr, gr.update(visible=not curr)
+
+    mic_toggle_btn.click(fn=toggle_voice, inputs=voice_enabled, outputs=[voice_enabled, voice_section])
+    send_btn.click(fn=handle_chat,
                    inputs=[user_prompt, chat_state, thread_state, image_state],
                    outputs=[user_prompt, chat, thread_state, image_state])
-
     image_state.change(fn=lambda x: x, inputs=image_state, outputs=image_display)
     voice_input.stream(fn=send_audio, inputs=[voice_input, client_id], outputs=voice_transcript, stream_every=0.5)
     clear_btn.click(fn=clear_transcript, inputs=[client_id], outputs=voice_transcript)
-    app.load(create_ws, outputs=[client_id])
+    app.load(fn=create_ws, outputs=[client_id])
 
 app.launch()
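For readers skimming the patch: the three lines added in the `@@ -45,7 +45,10 @@` hunk finish a `websocket.send` call the old revision left dangling, so each microphone chunk is resampled to 24 kHz PCM16 WAV, base64-encoded, and appended to the realtime session's input buffer. Below is a minimal standalone sketch of that same step; the `append_chunk` wrapper, the float-to-int16 conversion, and the already-open connection `ws` are assumptions for illustration, not code from the app.

```python
import base64
import io
import json

import numpy as np
import soundfile as sf
from pydub import AudioSegment

async def append_chunk(ws, samples: np.ndarray, sr: int):
    """Hypothetical helper mirroring the patched send path."""
    # Convert float samples in [-1, 1] to 16-bit PCM (assumed input format).
    int16 = (samples * 32767).astype(np.int16)
    buf = io.BytesIO()
    sf.write(buf, int16, sr, format="WAV", subtype="PCM_16")
    # Resample to the 24 kHz rate the transcription session uses.
    audio = AudioSegment.from_file(buf, format="wav").set_frame_rate(24000)
    out = io.BytesIO()
    audio.export(out, format="wav")
    out.seek(0)
    # Same event shape as the diff: base64 audio appended to the input buffer.
    await ws.send(json.dumps({
        "type": "input_audio_buffer.append",
        "audio": base64.b64encode(out.read()).decode()
    }))
```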
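The `@@ -97,7 +100,10 @@` hunk closes the other dangling call: `re.search` now receives both the pattern and `content`, and the pattern only matches PNG paths under the repo's raw GitHub prefix. A quick way to sanity-check it (the sample reply string is invented):

```python
import re

PATTERN = (r'https://raw\.githubusercontent\.com/AndrewLORTech/'
           r'surgical-pathology-manual/main/[\w\-/]*\.png')

# Invented assistant reply, used only to exercise the pattern.
content = ("The slide is at https://raw.githubusercontent.com/AndrewLORTech/"
           "surgical-pathology-manual/main/ch01/fig-2.png in the manual.")

match = re.search(PATTERN, content)
print(match.group(0) if match else "no image URL found")
```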
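Two smaller fixes round out the wiring: `app.load` now passes its callback as `fn=create_ws`, and the new `toggle_voice` handler uses the standard Gradio pattern of returning a `gr.update(...)` alongside the new state value to flip a container's visibility. A stripped-down sketch of that toggle, with a plain Textbox standing in for the app's voice accordion:

```python
import gradio as gr

with gr.Blocks() as demo:
    voice_enabled = gr.State(False)
    mic_btn = gr.Button("🎙️")
    # Stand-in for the app's voice section; hidden until toggled.
    panel = gr.Textbox("voice controls", visible=False)

    def toggle_voice(curr):
        # Return the flipped flag for the State and a visibility
        # update for the panel, matching the outputs list below.
        return not curr, gr.update(visible=not curr)

    mic_btn.click(fn=toggle_voice, inputs=voice_enabled,
                  outputs=[voice_enabled, panel])

demo.launch()
```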