aimeri committed on
Commit 039d869 · 1 Parent(s): ca99eca

Add application file

Files changed (3)
  1. README.md +36 -1
  2. app.py +265 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -11,4 +11,39 @@ license: mit
  short_description: A space exploring omni modality capabilities
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Qwen2.5-Omni Multimodal Chat Demo
+
+ This Space demonstrates the capabilities of Qwen2.5-Omni, an end-to-end multimodal model that can perceive text, images, audio, and video, and respond with both text and natural speech.
+
+ ## Features
+
+ - **Omni-modal Understanding**: Process text, images, audio, and video inputs
+ - **Multimodal Responses**: Generate both text and natural speech outputs
+ - **Multi-turn Chat**: Keep conversation context across turns
+ - **Customizable Voice**: Choose between male and female voice outputs
+
+ ## How to Use
+
+ 1. **Text Input**: Type your message in the text box and click "Send Text"
+ 2. **Multimodal Input**:
+    - Upload images, audio files, or videos
+    - Optionally add accompanying text
+    - Click "Send Multimodal Input"
+ 3. **Voice Settings**:
+    - Toggle audio output on or off
+    - Select the preferred voice type
+
+ ## Examples
+
+ Try these interactions:
+ - Upload an image and ask "Describe what you see"
+ - Upload an audio clip and ask "What is being said here?"
+ - Upload a video and ask "What's happening in this video?"
+ - Ask complex questions like "Explain quantum computing in simple terms"
+
+ ## Technical Details
+
+ This demo uses:
+ - The Qwen2.5-Omni-7B model
+ - FlashAttention-2 for accelerated inference (when a CUDA GPU is available)
+ - Gradio for the interactive interface
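
For readers skimming the diff, the Technical Details above boil down to the inference flow that `app.py` (next file) implements. The following is a minimal sketch, not a drop-in replacement for the app code; it assumes the `Qwen2_5OmniModel` API of the transformers commit pinned in `requirements.txt` and a CUDA GPU, and the keyword arguments mirror `app.py`.

```python
# Minimal sketch of the text + speech generation path used by this Space.
# Mirrors app.py; assumes the transformers commit pinned in requirements.txt.
import torch
import soundfile as sf
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

model = Qwen2_5OmniModel.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    enable_audio_output=True,
)
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

conversation = [
    {"role": "system", "content": "You are Qwen, a virtual human developed by the Qwen Team, "
                                  "Alibaba Group, capable of perceiving auditory and visual inputs, "
                                  "as well as generating text and speech."},
    {"role": "user", "content": [{"type": "text", "text": "Explain quantum computing in simple terms."}]},
]

# Build model inputs from the conversation (the audio/image/video lists are empty here).
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)
inputs = processor(text=text, audios=audios, images=images, videos=videos,
                   return_tensors="pt", padding=True).to(model.device)

# Generate both token ids and a 24 kHz speech waveform.
text_ids, audio = model.generate(**inputs, use_audio_in_video=True,
                                 return_audio=True, spk="Chelsie")
print(processor.batch_decode(text_ids, skip_special_tokens=True)[0])
sf.write("reply.wav", audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)
```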
app.py ADDED
@@ -0,0 +1,265 @@
+ import gradio as gr
+ import torch
+ from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
+ from qwen_omni_utils import process_mm_info
+ import soundfile as sf
+ import os
+ from datetime import datetime
+ import tempfile
+ import base64
+
+ # Initialize the model and processor
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+
+ model = Qwen2_5OmniModel.from_pretrained(
+     "Qwen/Qwen2.5-Omni-7B",
+     torch_dtype=torch_dtype,
+     device_map="auto",
+     enable_audio_output=True,
+     attn_implementation="flash_attention_2" if torch.cuda.is_available() else None
+ )
+
+ processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
+
+ # System prompt
+ SYSTEM_PROMPT = {
+     "role": "system",
+     "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
+ }
+
+ # Voice options (display label -> speaker name expected by the model)
+ VOICE_OPTIONS = {
+     "Chelsie (Female)": "Chelsie",
+     "Ethan (Male)": "Ethan"
+ }
+
+ def process_input(user_input, chat_history, voice_type, enable_audio_output):
+     # Prepare conversation history
+     conversation = [SYSTEM_PROMPT]
+
+     # Add previous chat history
+     for user_msg, bot_msg in chat_history:
+         conversation.append({"role": "user", "content": user_input_to_content(user_msg)})
+         conversation.append({"role": "assistant", "content": bot_msg})
+
+     # Add current user input
+     conversation.append({"role": "user", "content": user_input_to_content(user_input)})
+
+     # Prepare for inference
+     text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+     audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)
+
+     inputs = processor(
+         text=text,
+         audios=audios,
+         images=images,
+         videos=videos,
+         return_tensors="pt",
+         padding=True
+     )
+     inputs = inputs.to(model.device).to(model.dtype)
+
+     # Generate response
+     if enable_audio_output:
+         text_ids, audio = model.generate(
+             **inputs,
+             use_audio_in_video=True,
+             return_audio=True,
+             spk=VOICE_OPTIONS.get(voice_type, "Chelsie")  # map dropdown label to speaker name
+         )
+
+         # Save audio to temporary file
+         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+             sf.write(
+                 tmp_file.name,
+                 audio.reshape(-1).detach().cpu().numpy(),
+                 samplerate=24000,
+             )
+             audio_path = tmp_file.name
+     else:
+         text_ids = model.generate(
+             **inputs,
+             use_audio_in_video=True,
+             return_audio=False
+         )
+         audio_path = None
+
+     # Decode text response
+     text_response = processor.batch_decode(
+         text_ids,
+         skip_special_tokens=True,
+         clean_up_tokenization_spaces=False
+     )[0]
+
+     # Clean up text response
+     text_response = text_response.strip()
+
+     # Update chat history (gr.Chatbot expects strings, so summarize multimodal inputs)
+     display_input = user_input if isinstance(user_input, str) else (user_input.get("text") or "[multimodal input]")
+     chat_history.append((display_input, text_response))
+
+     # Prepare output
+     if enable_audio_output and audio_path:
+         return chat_history, text_response, audio_path
+     else:
+         return chat_history, text_response, None
+
+ def user_input_to_content(user_input):
+     if isinstance(user_input, str):
+         return user_input
+     elif isinstance(user_input, dict):
+         # Handle file uploads: keep only the slots that were actually filled in
+         content = []
+         if "text" in user_input and user_input["text"]:
+             content.append({"type": "text", "text": user_input["text"]})
+         if "image" in user_input and user_input["image"]:
+             content.append({"type": "image", "image": user_input["image"]})
+         if "audio" in user_input and user_input["audio"]:
+             content.append({"type": "audio", "audio": user_input["audio"]})
+         if "video" in user_input and user_input["video"]:
+             content.append({"type": "video", "video": user_input["video"]})
+         return content
+     return user_input
+
+ def create_demo():
+     with gr.Blocks(title="Qwen2.5-Omni Chat Demo", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# Qwen2.5-Omni Multimodal Chat Demo")
+         gr.Markdown("Experience the omni-modal capabilities of Qwen2.5-Omni through text, images, audio, and video interactions.")
+
+         # Chat interface
+         with gr.Row():
+             with gr.Column(scale=3):
+                 chatbot = gr.Chatbot(height=600)
+                 with gr.Accordion("Advanced Options", open=False):
+                     voice_type = gr.Dropdown(
+                         choices=list(VOICE_OPTIONS.keys()),
+                         value="Chelsie (Female)",
+                         label="Voice Type"
+                     )
+                     enable_audio_output = gr.Checkbox(
+                         value=True,
+                         label="Enable Audio Output"
+                     )
+
+                 # Multimodal input components
+                 with gr.Tabs():
+                     with gr.TabItem("Text Input"):
+                         text_input = gr.Textbox(
+                             placeholder="Type your message here...",
+                             label="Text Input"
+                         )
+                         text_submit = gr.Button("Send Text")
+
+                     with gr.TabItem("Multimodal Input"):
+                         with gr.Row():
+                             image_input = gr.Image(
+                                 type="filepath",
+                                 label="Upload Image"
+                             )
+                             audio_input = gr.Audio(
+                                 type="filepath",
+                                 label="Upload Audio"
+                             )
+                         with gr.Row():
+                             video_input = gr.Video(
+                                 label="Upload Video"
+                             )
+                             additional_text = gr.Textbox(
+                                 placeholder="Additional text message...",
+                                 label="Additional Text"
+                             )
+                         multimodal_submit = gr.Button("Send Multimodal Input")
+
+                 clear_button = gr.Button("Clear Chat")
+
+             with gr.Column(scale=1):
+                 gr.Markdown("## Model Capabilities")
+                 gr.Markdown(
+                     """
+ **Qwen2.5-Omni can:**
+ - Process and understand text
+ - Analyze images and answer questions about them
+ - Transcribe and understand audio
+ - Analyze video content (with or without audio)
+ - Generate natural speech responses
+ """
+                 )
+
+                 gr.Markdown("### Example Prompts")
+                 gr.Examples(
+                     examples=[
+                         ["Describe what you see in this image", "image"],
+                         ["What is being said in this audio clip?", "audio"],
+                         ["What's happening in this video?", "video"],
+                         ["Explain quantum computing in simple terms", "text"],
+                         ["Generate a short story about a robot learning to paint", "text"]
+                     ],
+                     inputs=[text_input, gr.Textbox(visible=False)],
+                     label="Text Examples"
+                 )
+
+                 audio_output = gr.Audio(
+                     label="Model Speech Output",
+                     visible=True,
+                     autoplay=True
+                 )
+                 text_output = gr.Textbox(
+                     label="Model Text Response",
+                     interactive=False
+                 )
+
+         # Text input handling: send the text box contents straight to process_input
+         text_submit.click(
+             fn=process_input,
+             inputs=[text_input, chatbot, voice_type, enable_audio_output],
+             outputs=[chatbot, text_output, audio_output]
+         )
+
+         # Multimodal input handling: bundle the uploads into a single dict for process_input
+         def prepare_multimodal_input(image, audio, video, text):
+             return {
+                 "text": text,
+                 "image": image,
+                 "audio": audio,
+                 "video": video
+             }
+
+         def handle_multimodal(image, audio, video, text, history, voice, audio_enabled):
+             user_input = prepare_multimodal_input(image, audio, video, text)
+             return process_input(user_input, history, voice, audio_enabled)
+
+         multimodal_submit.click(
+             fn=handle_multimodal,
+             inputs=[image_input, audio_input, video_input, additional_text,
+                     chatbot, voice_type, enable_audio_output],
+             outputs=[chatbot, text_output, audio_output]
+         )
+
+         # Clear chat
+         def clear_chat():
+             return [], None, None
+
+         clear_button.click(
+             fn=clear_chat,
+             outputs=[chatbot, text_output, audio_output]
+         )
+
+         # Update audio output visibility
+         def toggle_audio_output(enable_audio):
+             return gr.Audio(visible=enable_audio)
+
+         enable_audio_output.change(
+             fn=toggle_audio_output,
+             inputs=enable_audio_output,
+             outputs=audio_output
+         )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = create_demo()
+     demo.launch(server_name="0.0.0.0", server_port=7860)
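
A note on the data flow above: the "Multimodal Input" tab produces a plain dict, and `user_input_to_content` turns it into the typed content list that `apply_chat_template` and `process_mm_info` expect. A small illustration of that conversion (the file path below is a hypothetical placeholder):

```python
# Hypothetical example of what the "Multimodal Input" tab hands to process_input.
user_input = {
    "text": "Describe what you see",
    "image": "/tmp/gradio/upload/example.jpg",  # gr.Image(type="filepath") returns a path
    "audio": None,
    "video": None,
}

# user_input_to_content(user_input) drops the empty slots and yields the typed parts:
content = [
    {"type": "text", "text": "Describe what you see"},
    {"type": "image", "image": "/tmp/gradio/upload/example.jpg"},
]

# ...which becomes one user turn in the conversation passed to the processor:
conversation_turn = {"role": "user", "content": content}
```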
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ transformers @ git+https://github.com/huggingface/transformers@3a1ead0aabed473eafe527915eea8c197d424356
+ qwen-omni-utils[decord]
+ soundfile
+ torch
+ gradio
+ flash-attn --no-build-isolation
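
One caveat on the last line: `--no-build-isolation` is a `pip install` flag and may not be honored when written inline in a requirements file, so `flash-attn` may need to be installed in a separate step with `torch` already present (or from a prebuilt wheel). An optional sanity check that the pinned stack imports correctly, assuming the environment above:

```python
# Optional environment check for this Space's pinned dependencies.
import torch
import transformers

print("transformers:", transformers.__version__)      # dev commit pinned in requirements.txt
print("CUDA available:", torch.cuda.is_available())   # the FlashAttention-2 path needs a GPU

# Raises ImportError if the pinned commit (with Qwen2.5-Omni support) is not installed.
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
print("Qwen2.5-Omni classes available.")
```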