Refactor process_input and create_demo functions in app.py to enhance chat history management and improve text response handling, including the addition of user and assistant avatars. Update input clearing logic for better user experience with multimodal inputs.
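The input-clearing change described above relies on Gradio event chaining. Below is a minimal, self-contained sketch of the pattern, not the Space's actual app.py: the component names (text_input, text_submit, chatbot) and the avatar file names mirror this repo, while respond() is a hypothetical stand-in for process_input(), and the tuple-style chat history matches what this app uses.

```python
import gradio as gr

def respond(text, history):
    # Hypothetical stand-in for process_input(): fill in the pending assistant turn.
    history[-1][1] = f"You said: {text}"
    return history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(height=600, show_label=False,
                         avatar_images=["user.png", "assistant.png"])  # assumes both PNGs exist
    text_input = gr.Textbox(placeholder="Type a message")
    text_submit = gr.Button("Send")

    text_submit.click(
        fn=lambda text: [[text or "", None]],  # show the user turn immediately
        inputs=text_input,
        outputs=chatbot,
        queue=False,
    ).then(
        fn=respond,                            # the model call would happen here
        inputs=[text_input, chatbot],
        outputs=chatbot,
    ).then(
        fn=lambda: "",                         # clear the textbox after submission
        outputs=text_input,
    )

if __name__ == "__main__":
    demo.launch()
```

Chaining the clearing step with .then() keeps the textbox populated while the handler runs and only empties it once the response has been written back to the chatbot.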
Add binary files using Git LFS
Update .gitattributes to include PNG files in Git LFS and add new binary images for user and assistant avatars.
Refactor model initialization and process_input function in app.py to improve GPU memory management and enhance error handling during multimodal input processing. Introduce a dedicated get_model function to manage model loading and memory clearing, ensuring efficient resource usage.
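A simplified sketch of the loading pattern this commit describes: free cached GPU memory, then load the model once at startup. It uses a generic text-only checkpoint and transformers' AutoModelForCausalLM as stand-ins rather than the Qwen2.5-Omni classes used in app.py, and the argument choices are illustrative, not the Space's exact configuration.

```python
import gc
import torch
from transformers import AutoModelForCausalLM

torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float16

def get_model(model_id: str = "Qwen/Qwen2.5-7B-Instruct"):
    """Load the model once, freeing cached GPU memory first."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # release cached CUDA blocks before the large allocation
        gc.collect()              # drop unreachable Python objects still holding GPU tensors

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        device_map="auto",        # let accelerate place weights across available devices
        low_cpu_mem_usage=True,   # stream weights in instead of building a full copy in host RAM
    )
    return model

model = get_model()
```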
- .gitattributes +1 -0
- app.py +181 -132
- assistant.png +3 -0
- user.png +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
app.py
CHANGED
@@ -5,19 +5,28 @@ from qwen_omni_utils import process_mm_info
import soundfile as sf
import tempfile
import spaces
+import gc

# Initialize the model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float16

-[… removed lines not recoverable from this view …]
+def get_model():
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        gc.collect()
+
+    model = Qwen2_5OmniModel.from_pretrained(
+        "Qwen/Qwen2.5-Omni-7B",
+        torch_dtype=torch_dtype,
+        device_map="auto",
+        enable_audio_output=True,
+        low_cpu_mem_usage=True,
+        # attn_implementation="flash_attention_2" if torch.cuda.is_available() else None
+    )
+    return model

+model = get_model()
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

# System prompt
@@ -34,131 +43,166 @@ VOICE_OPTIONS = {

@spaces.GPU
def process_input(image, audio, video, text, chat_history, voice_type, enable_audio_output):
-    # Combine multimodal inputs
-    user_input = {
-        "text": text,
-        "image": image if image is not None else None,
-        "audio": audio if audio is not None else None,
-        "video": video if video is not None else None
-    }
-
-    # Prepare conversation history for model processing
-    conversation = [SYSTEM_PROMPT]
-
-    # Add previous chat history
-    if isinstance(chat_history, list):
-        for item in chat_history:
-            if isinstance(item, list) and len(item) == 2:
-                user_msg, bot_msg = item
-                if bot_msg is not None:  # Only add complete message pairs
-                    # Convert display format back to processable format
-                    processed_msg = user_msg
-                    if "[Image]" in user_msg:
-                        processed_msg = {"type": "text", "text": user_msg.replace("[Image]", "").strip()}
-                    if "[Audio]" in user_msg:
-                        processed_msg = {"type": "text", "text": user_msg.replace("[Audio]", "").strip()}
-                    if "[Video]" in user_msg:
-                        processed_msg = {"type": "text", "text": user_msg.replace("[Video]", "").strip()}
-
-                    conversation.append({"role": "user", "content": processed_msg})
-                    conversation.append({"role": "assistant", "content": bot_msg})
-    else:
-        # Initialize chat history if it's not a list
-        chat_history = []
-
-    # Add current user input
-    conversation.append({"role": "user", "content": user_input_to_content(user_input)})
-
-    # Prepare for inference
-    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    try:
-[… removed lines not recoverable from this view …]
-        inputs = processor(
-            text=text,
-            audios=audios,
-            images=images,
-            videos=videos,
-            return_tensors="pt",
-            padding=True
-        )
-        inputs = inputs.to(model.device).to(model.dtype)
-
-        # Generate response with streaming
-        if enable_audio_output:
-            voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
-            text_ids, audio = model.generate(
-                **inputs,
-                use_audio_in_video=False,  # Set to False to avoid audio processing issues
-                return_audio=True,
-                spk=voice_type_value,
-                max_new_tokens=512,
-                do_sample=True,
-                temperature=0.7,
-                top_p=0.9,
-                streamer=TextStreamer(processor, skip_prompt=True)
-            )
+        # Clear GPU memory before processing
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            gc.collect()

-[… removed lines not recoverable from this view …]
+        # Combine multimodal inputs
+        user_input = {
+            "text": text,
+            "image": image if image is not None else None,
+            "audio": audio if audio is not None else None,
+            "video": video if video is not None else None
+        }
+
+        # Prepare conversation history for model processing
+        conversation = [SYSTEM_PROMPT]
+
+        # Add previous chat history
+        if isinstance(chat_history, list):
+            for item in chat_history:
+                if isinstance(item, list) and len(item) == 2:
+                    user_msg, bot_msg = item
+                    if bot_msg is not None:  # Only add complete message pairs
+                        # Convert display format back to processable format
+                        processed_msg = user_msg
+                        if "[Image]" in user_msg:
+                            processed_msg = {"type": "text", "text": user_msg.replace("[Image]", "").strip()}
+                        if "[Audio]" in user_msg:
+                            processed_msg = {"type": "text", "text": user_msg.replace("[Audio]", "").strip()}
+                        if "[Video]" in user_msg:
+                            processed_msg = {"type": "text", "text": user_msg.replace("[Video]", "").strip()}
+
+                        conversation.append({"role": "user", "content": processed_msg})
+                        conversation.append({"role": "assistant", "content": bot_msg})
+
+        # Add current user input
+        conversation.append({"role": "user", "content": user_input_to_content(user_input)})
+
+        # Prepare for inference
+        model_input = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+        try:
+            audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)  # Default to no audio in video
+        except Exception as e:
+            print(f"Error processing multimedia: {str(e)}")
+            audios, images, videos = [], [], []  # Fallback to empty lists
+
+        inputs = processor(
+            text=model_input,
+            audios=audios,
+            images=images,
+            videos=videos,
+            return_tensors="pt",
+            padding=True
        )
-[… removed lines not recoverable from this view …]
+
+        # Move inputs to device and convert dtype
+        inputs = {k: v.to(device=model.device, dtype=model.dtype) if isinstance(v, torch.Tensor) else v
+                  for k, v in inputs.items()}
+
+        # Generate response with streaming
+        try:
+            if enable_audio_output:
+                voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
+                text_ids, audio = model.generate(
+                    **inputs,
+                    use_audio_in_video=False,  # Set to False to avoid audio processing issues
+                    return_audio=True,
+                    spk=voice_type_value,
+                    max_new_tokens=512,
+                    do_sample=True,
+                    temperature=0.7,
+                    top_p=0.9,
+                    streamer=TextStreamer(processor, skip_prompt=True)
+                )
+
+                # Save audio to temporary file
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                    sf.write(
+                        tmp_file.name,
+                        audio.reshape(-1).detach().cpu().numpy(),
+                        samplerate=24000,
+                    )
+                audio_path = tmp_file.name
+            else:
+                text_ids = model.generate(
+                    **inputs,
+                    use_audio_in_video=False,  # Set to False to avoid audio processing issues
+                    return_audio=False,
+                    max_new_tokens=512,
+                    do_sample=True,
+                    temperature=0.7,
+                    top_p=0.9,
+                    streamer=TextStreamer(processor, skip_prompt=True)
+                )
+                audio_path = None
+
+            # Decode text response
+            text_response = processor.batch_decode(
+                text_ids,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False
+            )[0]
+
+            # Clean up text response by removing system/user messages and special tokens
+            text_response = text_response.strip()
+            # Remove everything before the last assistant's message
+            if "<|im_start|>assistant" in text_response:
+                text_response = text_response.split("<|im_start|>assistant")[-1]
+            # Remove any remaining special tokens
+            text_response = text_response.replace("<|im_end|>", "").replace("<|im_start|>", "")
+            if text_response.startswith(":"):
+                text_response = text_response[1:].strip()
+
+            # Format user message for chat history display
+            user_message_for_display = str(text) if text is not None else ""
+            if image is not None:
+                user_message_for_display = (user_message_for_display + " " if user_message_for_display.strip() else "") + "[Image]"
+            if audio is not None:
+                user_message_for_display = (user_message_for_display + " " if user_message_for_display.strip() else "") + "[Audio]"
+            if video is not None:
+                user_message_for_display = (user_message_for_display + " " if user_message_for_display.strip() else "") + "[Video]"
+
+            # If empty, provide a default message
+            if not user_message_for_display.strip():
+                user_message_for_display = "Multimodal input"
+
+            # Update chat history with properly formatted entries
+            if not isinstance(chat_history, list):
+                chat_history = []
+
+            # Find the last incomplete message pair if it exists
+            if chat_history and isinstance(chat_history[-1], list) and len(chat_history[-1]) == 2 and chat_history[-1][1] is None:
+                chat_history[-1][1] = text_response
+            else:
+                chat_history.append([user_message_for_display, text_response])
+
+            # Clear GPU memory after processing
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                gc.collect()
+
+            # Prepare output
+            if enable_audio_output and audio_path:
+                return chat_history, text_response, audio_path
+            else:
+                return chat_history, text_response, None
+
+        except Exception as e:
+            print(f"Error during generation: {str(e)}")
+            error_msg = "I apologize, but I encountered an error processing your request. Please try again."
+            chat_history.append([user_message_for_display, error_msg])
+            return chat_history, error_msg, None
+
+    except Exception as e:
+        print(f"Error in process_input: {str(e)}")
+        if not isinstance(chat_history, list):
+            chat_history = []
+        error_msg = "I apologize, but I encountered an error processing your request. Please try again."
+        chat_history.append([str(text) if text is not None else "Error", error_msg])
+        return chat_history, error_msg, None

def user_input_to_content(user_input):
    if isinstance(user_input, str):
@@ -193,8 +237,7 @@ def create_demo():
        chatbot = gr.Chatbot(
            height=600,
            show_label=False,
-            avatar_images=["
-            bubble_full_width=False,
+            avatar_images=["user.png", "assistant.png"]
        )
        with gr.Accordion("Advanced Options", open=False):
            voice_type = gr.Dropdown(
@@ -277,7 +320,7 @@ def create_demo():

        # Text input handling
        text_submit.click(
-            fn=lambda text: [[
+            fn=lambda text: [[text if text is not None else "", None]],
            inputs=text_input,
            outputs=[chatbot],
            queue=False
@@ -285,6 +328,9 @@ def create_demo():
            fn=process_input,
            inputs=[placeholder_image, placeholder_audio, placeholder_video, text_input, chatbot, voice_type, enable_audio_output],
            outputs=[chatbot, text_output, audio_output]
+        ).then(
+            fn=lambda: "",  # Clear input after submission
+            outputs=text_input
        )

        # Multimodal input handling
@@ -313,6 +359,9 @@ def create_demo():
            inputs=[image_input, audio_input, video_input, additional_text,
                    chatbot, voice_type, enable_audio_output],
            outputs=[chatbot, text_output, audio_output]
+        ).then(
+            fn=lambda: (None, None, None, ""),  # Clear inputs after submission
+            outputs=[image_input, audio_input, video_input, additional_text]
        )

        # Clear chat
assistant.png
ADDED
Binary image tracked with Git LFS.
user.png
ADDED
Binary image tracked with Git LFS.