Update app.py
app.py CHANGED
@@ -7,10 +7,10 @@ import time
 import re
 
 from openai import OpenAI
+from dotenv import load_dotenv
 from realtime_transcriber import WebSocketClient, connections, WEBSOCKET_URI, WEBSOCKET_HEADERS
 
 # ------------------ Load API Key ------------------
-from dotenv import load_dotenv
 load_dotenv()
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 ASSISTANT_ID = os.getenv("ASSISTANT_ID")
@@ -56,14 +56,20 @@ def process_chat(message, history, session_id):
         time.sleep(1)
 
     messages = client.beta.threads.messages.list(thread_id=thread_id)
+    assistant_response = "⚠️ Assistant did not respond."
     for msg in reversed(messages.data):
         if msg.role == "assistant":
             assistant_response = msg.content[0].text.value
             break
-    else:
-        assistant_response = "⚠️ Assistant did not respond."
 
-    return assistant_response
+    return assistant_response
+
+def extract_image_url(text):
+    match = re.search(
+        r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
+        text
+    )
+    return match.group(0) if match else None
 
 # ------------------ Transcription Logic ------------------
 def create_websocket_client():
@@ -84,33 +90,49 @@ def send_audio_chunk(audio, client_id):
     connections[client_id].enqueue_audio_chunk(sr, y)
     return connections[client_id].transcript
 
-# ------------------ Gradio
+# ------------------ Gradio App ------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🧠 Document AI + 🎙️ Voice Assistant")
 
     session_id = gr.State(value=reset_session())
    client_id = gr.State()
+    image_url = gr.State(value=None)
 
-    # ---------- Section 1: Chat Interface ----------
     with gr.Row():
-        …
+        with gr.Column(scale=1):
+            image_display = gr.Image(label="📄 Extracted Document Image", show_label=True, height=400)
+        with gr.Column(scale=2):
+            chatbot = gr.ChatInterface(
+                fn=lambda message, history, session_id: process_chat(message, history, session_id),
+                additional_inputs=[session_id],
+                examples=[
+                    ["What does clause 3.2 mean?"],
+                    ["Summarize the timeline from the image."]
+                ],
+                title="💬 Document Assistant"
+            )
+
+    # Inject logic to extract image when assistant replies
+    def handle_reply_and_update_image(message, history, session_id):
+        response = process_chat(message, history, session_id)
+        url = extract_image_url(response)
+        return response, url
+
+    chatbot.fn = lambda message, history, session_id: handle_reply_and_update_image(message, history, session_id)[0]
+    chatbot.chatbot.change(
+        fn=lambda m, h, s: handle_reply_and_update_image(m, h, s)[1],
+        inputs=[chatbot.input, chatbot.chatbot, session_id],
+        outputs=image_display
+    )
 
-    #
+    # ------------------ Voice Transcription ------------------
     gr.Markdown("## 🎙️ Realtime Voice Transcription")
 
     with gr.Row():
         transcript_box = gr.Textbox(label="Live Transcript", lines=7, interactive=False, autoscroll=True)
-
+
     with gr.Row():
-        mic_input = gr.Audio(streaming=True)
+        mic_input = gr.Audio(streaming=True)
         clear_button = gr.Button("Clear Transcript")
 
     mic_input.stream(fn=send_audio_chunk, inputs=[mic_input, client_id], outputs=transcript_box)
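For readers following the change, below is a small, self-contained sketch of how the new extract_image_url helper behaves. The helper body is copied from the diff above; the sample reply text and the timeline.png filename are made-up values used only for illustration.

import re

# Helper introduced in this commit: return the first PNG link under the
# surgical-pathology-manual raw GitHub path found in the text, else None.
def extract_image_url(text):
    match = re.search(
        r'https://raw\.githubusercontent\.com/AndrewLORTech/surgical-pathology-manual/main/[\w\-/]*\.png',
        text
    )
    return match.group(0) if match else None

# Hypothetical assistant reply, used only to illustrate the behaviour.
reply = (
    "The timeline is shown on the scanned page: "
    "https://raw.githubusercontent.com/AndrewLORTech/surgical-pathology-manual/main/timeline.png"
)

print(extract_image_url(reply))   # prints the full .png URL
print(extract_image_url("No image referenced in this reply."))  # prints None

In the app itself, handle_reply_and_update_image passes this URL to the image_display component whenever the assistant replies, which is what the chatbot.chatbot.change wiring in the diff hooks up.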