Spaces:

aimeri
/

Qwen2.5-Omni-7B-Demo

Build error

App Files Files Community

aimeri committed 25 days ago

Commit

c98fc82

1 Parent(s): 9f83467

Update process_input function in app.py to handle audio generation output more robustly, introducing a fallback mechanism for text generation in case of unexpected output formats. Improve error handling during audio and text generation processes. Additionally, update requirements.txt to include flash-attn for enhanced performance.

Browse files

Files changed (2) hide show

app.py +42 -12
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -22,7 +22,7 @@ def get_model():
         device_map="auto",
         enable_audio_output=True,
         low_cpu_mem_usage=True,
-        # attn_implementation="flash_attention_2" if torch.cuda.is_available() else None
     )
     return model
@@ -123,11 +123,12 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
         try:
             text_ids = None
             audio_path = None
             if enable_audio_output:
                 voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
                 try:
-                    text_ids, audio = model.generate(
                         **inputs,
                         use_audio_in_video=False,
                         return_audio=True,
@@ -139,18 +140,47 @@ def process_input(image, audio, video, text, chat_history, voice_type, enable_au
                         streamer=TextStreamer(processor, skip_prompt=True)
                     )
-                    if audio is not None:
-                        # Save audio to temporary file
-                        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-                            sf.write(
-                                tmp_file.name,
-                                audio.reshape(-1).detach().cpu().numpy(),
-                                samplerate=24000,
-                            )
-                            audio_path = tmp_file.name
                 except Exception as e:
                     print(f"Error during audio generation: {str(e)}")
-                    text_ids = None
             else:
                 try:
                     text_ids = model.generate(

         device_map="auto",
         enable_audio_output=True,
         low_cpu_mem_usage=True,
+        attn_implementation="flash_attention_2" if torch.cuda.is_available() else None
     )
     return model
         try:
             text_ids = None
             audio_path = None
+            generation_output = None
             if enable_audio_output:
                 voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
                 try:
+                    generation_output = model.generate(
                         **inputs,
                         use_audio_in_video=False,
                         return_audio=True,
                         streamer=TextStreamer(processor, skip_prompt=True)
                     )
+                    if generation_output is not None and isinstance(generation_output, tuple) and len(generation_output) == 2:
+                        text_ids, audio = generation_output
+                        if audio is not None:
+                            # Save audio to temporary file
+                            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                                sf.write(
+                                    tmp_file.name,
+                                    audio.reshape(-1).detach().cpu().numpy(),
+                                    samplerate=24000,
+                                )
+                                audio_path = tmp_file.name
+                    else:
+                        print("Warning: Unexpected generation output format")
+                        # Fall back to text-only generation
+                        text_ids = model.generate(
+                            **inputs,
+                            use_audio_in_video=False,
+                            return_audio=False,
+                            max_new_tokens=512,
+                            do_sample=True,
+                            temperature=0.7,
+                            top_p=0.9,
+                            streamer=TextStreamer(processor, skip_prompt=True)
+                        )
                 except Exception as e:
                     print(f"Error during audio generation: {str(e)}")
+                    # Fall back to text-only generation
+                    try:
+                        text_ids = model.generate(
+                            **inputs,
+                            use_audio_in_video=False,
+                            return_audio=False,
+                            max_new_tokens=512,
+                            do_sample=True,
+                            temperature=0.7,
+                            top_p=0.9,
+                            streamer=TextStreamer(processor, skip_prompt=True)
+                        )
+                    except Exception as e:
+                        print(f"Error during fallback text generation: {str(e)}")
+                        text_ids = None
             else:
                 try:
                     text_ids = model.generate(

requirements.txt CHANGED Viewed

@@ -5,4 +5,5 @@ torch
 gradio
 torchvision
 torchaudio
-accelerate

 gradio
 torchvision
 torchaudio
+accelerate
+flash-attn