Update example snippet
Use the unified `transformers` processor API to prepare all inputs and apply the chat template in a single call.
README.md (changed)

````diff
@@ -624,9 +624,7 @@ Here we show a code snippet to show you how to use the chat model with `transformers`
 
 ```python
 import soundfile as sf
-
 from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
-from qwen_omni_utils import process_mm_info
 
 # default: Load the model on the available device(s)
 model = Qwen2_5OmniForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")
````
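
Neither hunk shows where `processor` is created; the `processor.apply_chat_template(...)` call further down assumes it already exists. A minimal sketch of that assumed setup, loading the processor from the same checkpoint as the model (this line sits between the two hunks and is not part of the diff itself):

```python
from transformers import Qwen2_5OmniProcessor

# Assumed setup, not part of this diff: a processor loaded from the same
# checkpoint as the model, which the apply_chat_template call below relies on.
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
```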

```diff
@@ -652,21 +650,29 @@
         "role": "user",
         "content": [
             {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
+            {"type": "text", "text": "What can you hear and see in this video?"},
         ],
     },
 ]
 
-# [removed: the earlier multi-step input preparation, not reproduced in this view]
+# Prepare inputs for inference
+inputs = processor.apply_chat_template(
+    conversation,
+    load_audio_from_video=True,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+    video_fps=2,
+
+    # kwargs to be passed to `Qwen2-5-OmniProcessor`
+    padding=True,
+    use_audio_in_video=True,
+)
 
 # Inference: Generation of the output text and audio
-# [removed: the earlier model.generate call, not reproduced in this view]
+# Generation params for audio or text can be different and have to be prefixed with `thinker_` or `talker_`
+text_ids, audio = model.generate(**inputs, use_audio_in_video=True, thinker_do_sample=False, talker_do_sample=True)
 
 text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
 print(text)
```
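
The updated snippet imports `soundfile`, but the hunks end before the generated waveform is written out. A minimal sketch of that final step, assuming the `audio` returned by `generate` is a single-channel torch tensor and assuming a 24 kHz output sample rate (both assumptions, not stated in this diff):

```python
import numpy as np
import soundfile as sf


def save_speech(audio, path="output.wav", sample_rate=24000):
    """Write the waveform returned by `model.generate(...)` to a WAV file.

    Flattening to 1-D float32 and the 24 kHz rate are assumptions about the
    model output, not something taken from the diff above.
    """
    waveform = audio.reshape(-1).detach().cpu().numpy().astype(np.float32)
    sf.write(path, waveform, samplerate=sample_rate)


# For example, after the generate call above:
# save_speech(audio)
```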