Qwen2.5-Omni-7B-Demo

Running on Zero

App Files Files Community

aimeri committed on Mar 28

Commit

6a119c7

1 Parent(s): 9e14c66

Refactor chat history handling in process_input function to ensure it initializes correctly when not a list; improve user message formatting for multimodal inputs in app.py.

Browse files

Files changed (1) hide show

app.py +22 -10

app.py CHANGED Viewed

@@ -46,9 +46,15 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
     conversation = [SYSTEM_PROMPT]
     # Add previous chat history
-    for user_msg, bot_msg in chat_history:
-        conversation.append({"role": "user", "content": user_input_to_content(user_msg)})
-        conversation.append({"role": "assistant", "content": bot_msg})
     # Add current user input
     conversation.append({"role": "user", "content": user_input_to_content(user_input)})
@@ -104,7 +110,7 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
     text_response = text_response.strip()
     # Format user message for chat history display
-    user_message_for_display = text
     if image is not None:
         user_message_for_display = (user_message_for_display or "Image uploaded") + " [Image]"
     if audio is not None:
@@ -112,7 +118,13 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
     if video is not None:
         user_message_for_display = (user_message_for_display or "Video uploaded") + " [Video]"
     # Update chat history with properly formatted entries
     chat_history.append((user_message_for_display, text_response))
     # Prepare output
@@ -230,7 +242,7 @@ def create_demo():
         # Text input handling
         text_submit.click(
-            fn=lambda text: text,
             inputs=text_input,
             outputs=[chatbot],
             queue=False
@@ -243,15 +255,15 @@ def create_demo():
         # Multimodal input handling
         def prepare_multimodal_input(image, audio, video, text):
             # Create a display message that indicates what was uploaded
-            display_message = text or ""
             if image is not None:
-                display_message = (display_message + " " if display_message else "") + "[Image]"
             if audio is not None:
-                display_message = (display_message + " " if display_message else "") + "[Audio]"
             if video is not None:
-                display_message = (display_message + " " if display_message else "") + "[Video]"
-            if not display_message:
                 display_message = "Multimodal content"
             return display_message

     conversation = [SYSTEM_PROMPT]
     # Add previous chat history
+    if isinstance(chat_history, list):
+        for item in chat_history:
+            if isinstance(item, tuple) and len(item) == 2:
+                user_msg, bot_msg = item
+                conversation.append({"role": "user", "content": user_input_to_content(user_msg)})
+                conversation.append({"role": "assistant", "content": bot_msg})
+    else:
+        # Initialize chat history if it's not a list
+        chat_history = []
     # Add current user input
     conversation.append({"role": "user", "content": user_input_to_content(user_input)})
     text_response = text_response.strip()
     # Format user message for chat history display
+    user_message_for_display = str(text) if text is not None else ""
     if image is not None:
         user_message_for_display = (user_message_for_display or "Image uploaded") + " [Image]"
     if audio is not None:
     if video is not None:
         user_message_for_display = (user_message_for_display or "Video uploaded") + " [Video]"
+    # If empty, provide a default message
+    if not user_message_for_display.strip():
+        user_message_for_display = "Multimodal input"
     # Update chat history with properly formatted entries
+    if not isinstance(chat_history, list):
+        chat_history = []
     chat_history.append((user_message_for_display, text_response))
     # Prepare output
         # Text input handling
         text_submit.click(
+            fn=lambda text: str(text) if text is not None else "",
             inputs=text_input,
             outputs=[chatbot],
             queue=False
         # Multimodal input handling
         def prepare_multimodal_input(image, audio, video, text):
             # Create a display message that indicates what was uploaded
+            display_message = str(text) if text is not None else ""
             if image is not None:
+                display_message = (display_message + " " if display_message.strip() else "") + "[Image]"
             if audio is not None:
+                display_message = (display_message + " " if display_message.strip() else "") + "[Audio]"
             if video is not None:
+                display_message = (display_message + " " if display_message.strip() else "") + "[Video]"
+            if not display_message.strip():
                 display_message = "Multimodal content"
             return display_message