pcuenq (HF Staff) committed
Commit 618948b · verified · 1 Parent(s): 08f233e

Update example snippet


Use the unified transformers processor API to prepare all inputs and apply the chat template in a single call.
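In short, the previous three-step preparation (render the chat template as text, extract audio/images/videos with `process_mm_info`, then call the processor) collapses into a single `apply_chat_template` call. A minimal sketch of the new pattern, with the arguments taken verbatim from the diff below (the explanatory comments are interpretive and not part of the commit):

```python
# Single-call preparation introduced by this commit (see the diff below).
inputs = processor.apply_chat_template(
    conversation,                  # same message structure as before
    load_audio_from_video=True,    # pull the audio track out of the video
    add_generation_prompt=True,
    tokenize=True,                 # return token ids instead of a prompt string
    return_dict=True,              # return a dict of model-ready tensors
    return_tensors="pt",
    video_fps=2,                   # frame sampling rate for the video
    # kwargs forwarded to the underlying processor
    padding=True,
    use_audio_in_video=True,
)
```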

Files changed (1)
  1. README.md +17 -11
README.md CHANGED
@@ -624,9 +624,7 @@ Here we show a code snippet to show you how to use the chat model with `transformers`
 
```python
import soundfile as sf
-
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
- from qwen_omni_utils import process_mm_info

# default: Load the model on the available device(s)
model = Qwen2_5OmniForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")
@@ -652,21 +650,29 @@ conversation = [
        "role": "user",
        "content": [
            {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
+             {"type": "text", "text": "What can you hear and see in this video?"},
        ],
    },
]

- # set use audio in video
- USE_AUDIO_IN_VIDEO = True
-
- # Preparation for inference
- text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
- audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
- inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
- inputs = inputs.to(model.device).to(model.dtype)
+ # Prepare inputs for inference
+ inputs = processor.apply_chat_template(
+     conversation,
+     load_audio_from_video=True,
+     add_generation_prompt=True,
+     tokenize=True,
+     return_dict=True,
+     return_tensors="pt",
+     video_fps=2,
+
+     # kwargs to be passed to `Qwen2-5-OmniProcessor`
+     padding=True,
+     use_audio_in_video=True,
+ )

# Inference: Generation of the output text and audio
- text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+ # Generation params for audio or text can be different and have to be prefixed with `thinker_` or `talker_`
+ text_ids, audio = model.generate(**inputs, use_audio_in_video=True, thinker_do_sample=False, talker_do_sample=True)

text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(text)
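
The hunks shown here end before the generated audio is written out, which is why `import soundfile as sf` stays in the snippet. A hypothetical continuation along these lines would persist the waveform; the output path, tensor reshaping, and 24 kHz sample rate below are assumptions, not part of this diff:

```python
# Hypothetical continuation (not part of this diff): save the generated audio.
# `audio` is the waveform tensor returned by model.generate above; the file name
# and the 24 kHz sample rate are assumed here.
sf.write(
    "output.wav",
    audio.reshape(-1).detach().cpu().numpy(),
    samplerate=24000,
)
```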