Spaces: Running on Zero
Update demo/infer.py
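This commit adds an hf_spaces flag (default False) to both inference entry points. When set, the demo stops carrying past_key_values/past_ids across calls and instead rebuilds the full conversation from the chat history on every turn. The streaming path additionally clamps video_timestamp to the last frame timestamp, decodes and prints the response inside the generator, and yields (start_timestamp, stop_timestamp), response, state; video_qa now passes return_attention_mask=False to the processor and keeps past_key_values in state rather than popping it.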
demo/infer.py CHANGED (+29 -26)
@@ -1,4 +1,4 @@
-import functools, torch
+import functools, torch
 from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl
 apply_liger_kernel_to_qwen2_vl()
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, LogitsProcessor, logging
@@ -62,6 +62,7 @@ class LiveCCDemoInfer:
         repetition_penalty: float = 1.05,
         streaming_eos_base_threshold: float = None,
         streaming_eos_threshold_step: float = None,
+        hf_spaces: bool = False,
         **kwargs,
     ):
         """
@@ -83,6 +84,7 @@ class LiveCCDemoInfer:
             state['video_pts'] = torch.from_numpy(video_reader._frame_pts[:, 1])
             state['last_video_pts_index'] = -1
         video_pts = state['video_pts']
+        video_timestamp = min(video_timestamp, video_pts[-1])
         if last_timestamp + self.frame_time_interval > video_pts[-1]:
             state['video_end'] = True
             return
@@ -140,7 +142,7 @@ class LiveCCDemoInfer:
             return_tensors="pt",
             return_attention_mask=False
         )
-        inputs.to(
+        inputs.to(self.model.device)
         if past_ids is not None:
             inputs['input_ids'] = torch.cat([past_ids, inputs.input_ids], dim=1)
         if streaming_eos_base_threshold is not None:
@@ -153,9 +155,11 @@ class LiveCCDemoInfer:
             repetition_penalty=repetition_penalty,
             logits_processor=logits_processor,
         )
-        state['past_key_values'] = outputs.past_key_values
-        state['past_ids'] = outputs.sequences[:, :-1]
-
+        state['past_key_values'] = outputs.past_key_values if not hf_spaces else None
+        state['past_ids'] = outputs.sequences[:, :-1] if not hf_spaces else None
+        response = self.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True)
+        print(response)
+        yield (start_timestamp, stop_timestamp), response, state
 
     @torch.inference_mode()
     def video_qa(
@@ -165,7 +169,7 @@ class LiveCCDemoInfer:
         state: dict,
         do_sample: bool = False,
         repetition_penalty: float = 1.05,
-
+        hf_spaces: bool = False,
         **kwargs,
     ):
         """
@@ -178,25 +182,24 @@ class LiveCCDemoInfer:
         last_history: list, last processed history
         """
         video_path = state.get('video_path', None)
-
-
-
-
-
-                {"type": "
-
-
-
+        conversation = []
+        if hf_spaces:
+            for past_message in history:
+                content = [{"type": "text", "text": past_message['content']}]
+                if video_path: # only use once
+                    content.insert(0, {"type": "video", "video": video_path})
+                    video_path = None
+                conversation.append({"role": past_message["role"], "content": content})
         else:
-            message = {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": query},
-                ],
-            }
-        image_inputs, video_inputs = process_vision_info([message])
-        texts = self.processor.apply_chat_template([message], tokenize=False, add_generation_prompt=True, return_tensors='pt')
+            pass # use past_key_values
         past_ids = state.get('past_ids', None)
+        content = [{"type": "text", "text": message}]
+        if past_ids is None and video_path: # only use once
+            content.insert(0, {"type": "video", "video": video_path})
+        conversation.append({"role": "user", "content": content})
+        print(conversation)
+        image_inputs, video_inputs = process_vision_info(conversation)
+        texts = self.processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True, return_tensors='pt')
         if past_ids is not None:
             texts = '<|im_end|>\n' + texts[self.system_prompt_offset:]
         inputs = self.processor(
@@ -204,6 +207,7 @@ class LiveCCDemoInfer:
             images=image_inputs,
             videos=video_inputs,
             return_tensors="pt",
+            return_attention_mask=False
         )
         inputs.to(self.model.device)
         if past_ids is not None:
@@ -214,9 +218,8 @@ class LiveCCDemoInfer:
             repetition_penalty=repetition_penalty,
            max_new_tokens=512,
         )
-        state['past_key_values'] = outputs.past_key_values
-        state['past_ids'] = outputs.sequences[:, :-1]
+        state['past_key_values'] = outputs.past_key_values if not hf_spaces else None
+        state['past_ids'] = outputs.sequences[:, :-1] if not hf_spaces else None
         response = self.processor.decode(outputs.sequences[0, inputs.input_ids.size(1):], skip_special_tokens=True)
         print(response)
-        state.pop('past_key_values')
         return response, state
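For reference, a minimal sketch of how the hf_spaces path might be driven from the Space's UI loop. The default construction of LiveCCDemoInfer and the message/history parameter names ahead of state are assumptions (only state, do_sample, repetition_penalty, hf_spaces, and **kwargs are visible in this diff); the caller owns the history, since no KV cache is kept in state:

    # Hypothetical driver for the hf_spaces code path: the caller keeps the chat
    # history itself, because past_key_values/past_ids are no longer cached.
    from demo.infer import LiveCCDemoInfer

    infer = LiveCCDemoInfer()                    # assumed default construction
    state = {'video_path': 'video.mp4'}          # video is attached to the first user turn only
    history = []                                 # entries shaped like {'role': ..., 'content': ...}

    for question in ('What is happening?', 'What changed since then?'):
        response, state = infer.video_qa(
            message=question,                    # parameter names assumed from the method body
            history=history,
            state=state,
            hf_spaces=True,                      # rebuild the conversation instead of reusing the KV cache
        )
        history += [
            {'role': 'user', 'content': question},
            {'role': 'assistant', 'content': response},
        ]

On the first call the history is empty, past_ids is never stored, and video_path is present, so the video is inserted ahead of the text content; on later calls the "only use once" branches attach the video to the earliest past message instead, matching the rebuilt-conversation logic above.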