pcuenq (HF Staff) committed
Commit 618948b · verified · 1 Parent(s): 08f233e

Update example snippet


Use the unified transformers processor API to prepare all inputs and apply the chat template in a single call.
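In short, the previous three-step preparation (render the chat template as text, extract audio/images/videos with `process_mm_info`, then call the processor) collapses into a single `apply_chat_template` call. A minimal sketch of the new pattern, with the arguments taken verbatim from the diff below (the explanatory comments are interpretive and not part of the commit):

```python
# Single-call preparation introduced by this commit (see the diff below).
inputs = processor.apply_chat_template(
    conversation,                  # same message structure as before
    load_audio_from_video=True,    # pull the audio track out of the video
    add_generation_prompt=True,
    tokenize=True,                 # return token ids instead of a prompt string
    return_dict=True,              # return a dict of model-ready tensors
    return_tensors="pt",
    video_fps=2,                   # frame sampling rate for the video
    # kwargs forwarded to the underlying processor
    padding=True,
    use_audio_in_video=True,
)
```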

Files changed (1)
  1. README.md +17 -11
README.md CHANGED
@@ -624,9 +624,7 @@ Here we show a code snippet to show you how to use the chat model with `transformers`
 
```python
import soundfile as sf
-
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
- from qwen_omni_utils import process_mm_info

# default: Load the model on the available device(s)
model = Qwen2_5OmniForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")
@@ -652,21 +650,29 @@ conversation = [
        "role": "user",
        "content": [
            {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
+             {"type": "text", "text": "What can you hear and see in this video?"},
        ],
    },
]

- # set use audio in video
- USE_AUDIO_IN_VIDEO = True
-
- # Preparation for inference
- text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
- audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
- inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
- inputs = inputs.to(model.device).to(model.dtype)
+ # Prepare inputs for inference
+ inputs = processor.apply_chat_template(
+     conversation,
+     load_audio_from_video=True,
+     add_generation_prompt=True,
+     tokenize=True,
+     return_dict=True,
+     return_tensors="pt",
+     video_fps=2,
+
+     # kwargs to be passed to `Qwen2-5-OmniProcessor`
+     padding=True,
+     use_audio_in_video=True,
+ )

# Inference: Generation of the output text and audio
- text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)
+ # Generation params for audio or text can be different and have to be prefixed with `thinker_` or `talker_`
+ text_ids, audio = model.generate(**inputs, use_audio_in_video=True, thinker_do_sample=False, talker_do_sample=True)

text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(text)
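
The hunks shown here end before the generated audio is written out, which is why `import soundfile as sf` stays in the snippet. A hypothetical continuation along these lines would persist the waveform; the output path, tensor reshaping, and 24 kHz sample rate below are assumptions, not part of this diff:

```python
# Hypothetical continuation (not part of this diff): save the generated audio.
# `audio` is the waveform tensor returned by model.generate above; the file name
# and the 24 kHz sample rate are assumed here.
sf.write(
    "output.wav",
    audio.reshape(-1).detach().cpu().numpy(),
    samplerate=24000,
)
```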